Update bcachefs sources to 99750eab4d bcachefs: Persist stripe blocks_used

Kent Overstreet 2019-01-23 15:49:44 -05:00
parent 1c50d258e3
commit 35fca2f044
37 changed files with 920 additions and 405 deletions


@@ -1 +1 @@
-bcca1c557b1897ecc3aeb1f89ab91865487d91ab
+99750eab4d583132cf61f071082c7cf21f5295c0

include/asm/page.h (new file)

@@ -37,6 +37,7 @@ typedef struct {
#define xchg_acquire(p, v) uatomic_xchg(p, v)
#define cmpxchg(p, old, new) uatomic_cmpxchg(p, old, new)
#define cmpxchg_acquire(p, old, new) uatomic_cmpxchg(p, old, new)
+#define cmpxchg_release(p, old, new) uatomic_cmpxchg(p, old, new)
#define smp_mb__before_atomic() cmm_smp_mb__before_uatomic_add()
#define smp_mb__after_atomic() cmm_smp_mb__after_uatomic_add()
@@ -77,6 +78,16 @@ typedef struct {
__old; \
})
+#define cmpxchg_release(p, old, new) \
+({ \
+	typeof(*(p)) __old = (old); \
+	\
+	__atomic_compare_exchange_n((p), &__old, new, false, \
+				    __ATOMIC_RELEASE, \
+				    __ATOMIC_RELEASE); \
+	__old; \
+})
#define smp_mb__before_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define smp_mb__after_atomic() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define smp_wmb() __atomic_thread_fence(__ATOMIC_SEQ_CST)
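For readers unfamiliar with the builtin the userspace shim maps to: a release-ordered compare-and-swap with the plain GCC __atomic builtins looks like the sketch below. This is not part of the patch; handoff(), owner and shared_data are illustrative names. Note the builtin takes separate success and failure memory orders, and the failure order may be weaker than the success order.

#include <stdbool.h>

static int shared_data;

/* Store the payload, then release ownership: with __ATOMIC_RELEASE on
 * success, the shared_data store is visible before *owner changes. */
static bool handoff(int *owner, int value)
{
	int expected = 1;

	shared_data = value;
	return __atomic_compare_exchange_n(owner, &expected, 0, false,
					   __ATOMIC_RELEASE, __ATOMIC_RELAXED);
}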


@@ -1,34 +1,60 @@
#ifndef _LINUX_GENERIC_RADIX_TREE_H
#define _LINUX_GENERIC_RADIX_TREE_H
-/*
+/**
-* Generic radix trees/sparse arrays:
+* DOC: Generic radix trees/sparse arrays:
*
-* A generic radix tree has all nodes of size PAGE_SIZE - both leaves and
+* Very simple and minimalistic, supporting arbitrary size entries up to
-* interior nodes.
+* PAGE_SIZE.
+*
+* A genradix is defined with the type it will store, like so:
+*
+* static GENRADIX(struct foo) foo_genradix;
+*
+* The main operations are:
+*
+* - genradix_init(radix) - initialize an empty genradix
+*
+* - genradix_free(radix) - free all memory owned by the genradix and
+* reinitialize it
+*
+* - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+* NULL if that entry does not exist
+*
+* - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+* allocating it if necessary
+*
+* - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+*
+* The radix tree allocates one page of entries at a time, so entries may exist
+* that were never explicitly allocated - they will be initialized to all
+* zeroes.
+*
+* Internally, a genradix is just a radix tree of pages, and indexing works in
+* terms of byte offsets. The wrappers in this header file use sizeof on the
+* type the radix contains to calculate a byte offset from the index - see
+* __idx_to_offset.
*/
+#include <asm/page.h>
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/log2.h>
-struct genradix_node;
+struct genradix_root;
struct __genradix {
-struct genradix_node *root;
+struct genradix_root __rcu *root;
-size_t depth;
};
/*
-* NOTE: currently, sizeof(_type) must be a power of two and not larger than
+* NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
-* PAGE_SIZE:
*/
#define __GENRADIX_INITIALIZER \
{ \
.tree = { \
.root = NULL, \
-.depth = 0, \
} \
}
@@ -49,6 +75,12 @@ struct { \
#define DEFINE_GENRADIX(_name, _type) \
GENRADIX(_type) _name = __GENRADIX_INITIALIZER
+/**
+* genradix_init - initialize a genradix
+* @_radix: genradix to initialize
+*
+* Does not fail
+*/
#define genradix_init(_radix) \
do { \
*(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER; \
@@ -56,11 +88,20 @@ do { \
void __genradix_free(struct __genradix *);
+/**
+* genradix_free: free all memory owned by a genradix
+* @_radix: the genradix to free
+*
+* After freeing, @_radix will be reinitialized and empty
+*/
#define genradix_free(_radix) __genradix_free(&(_radix)->tree)
static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
{
-BUILD_BUG_ON(obj_size > PAGE_SIZE);
+if (__builtin_constant_p(obj_size))
+BUILD_BUG_ON(obj_size > PAGE_SIZE);
+else
+BUG_ON(obj_size > PAGE_SIZE);
if (!is_power_of_2(obj_size)) {
size_t objs_per_page = PAGE_SIZE / obj_size;
@@ -79,7 +120,13 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
void *__genradix_ptr(struct __genradix *, size_t);
-/* Returns a pointer to element at @_idx */
+/**
+* genradix_ptr - get a pointer to a genradix entry
+* @_radix: genradix to access
+* @_idx: index to fetch
+*
+* Returns a pointer to entry at @_idx, or NULL if that entry does not exist.
+*/
#define genradix_ptr(_radix, _idx) \
(__genradix_cast(_radix) \
__genradix_ptr(&(_radix)->tree, \
@@ -87,7 +134,15 @@ void *__genradix_ptr(struct __genradix *, size_t);
void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
-/* Returns a pointer to element at @_idx, allocating it if necessary */
+/**
+* genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
+* if necessary
+* @_radix: genradix to access
+* @_idx: index to fetch
+* @_gfp: gfp mask
+*
+* Returns a pointer to entry at @_idx, or NULL on allocation failure
+*/
#define genradix_ptr_alloc(_radix, _idx, _gfp) \
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
@@ -99,6 +154,11 @@ struct genradix_iter {
size_t pos;
};
+/**
+* genradix_iter_init - initialize a genradix_iter
+* @_radix: genradix that will be iterated over
+* @_idx: index to start iterating from
+*/
#define genradix_iter_init(_radix, _idx) \
((struct genradix_iter) { \
.pos = (_idx), \
@@ -107,6 +167,14 @@ struct genradix_iter {
void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
+/**
+* genradix_iter_peek - get first entry at or above iterator's current
+* position
+* @_iter: a genradix_iter
+* @_radix: genradix being iterated over
+*
+* If no more entries exist at or above @_iter's current position, returns NULL
+*/
#define genradix_iter_peek(_iter, _radix) \
(__genradix_cast(_radix) \
__genradix_iter_peek(_iter, &(_radix)->tree, \
@@ -127,4 +195,37 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
#define genradix_iter_advance(_iter, _radix) \
__genradix_iter_advance(_iter, __genradix_obj_size(_radix))
+#define genradix_for_each_from(_radix, _iter, _p, _start) \
+for (_iter = genradix_iter_init(_radix, _start); \
+(_p = genradix_iter_peek(&_iter, _radix)) != NULL; \
+genradix_iter_advance(&_iter, _radix))
+/**
+* genradix_for_each - iterate over entry in a genradix
+* @_radix: genradix to iterate over
+* @_iter: a genradix_iter to track current position
+* @_p: pointer to genradix entry type
+*
+* On every iteration, @_p will point to the current entry, and @_iter.pos
+* will be the current entry's index.
+*/
+#define genradix_for_each(_radix, _iter, _p) \
+genradix_for_each_from(_radix, _iter, _p, 0)
+int __genradix_prealloc(struct __genradix *, size_t, gfp_t);
+/**
+* genradix_prealloc - preallocate entries in a generic radix tree
+* @_radix: genradix to preallocate
+* @_nr: number of entries to preallocate
+* @_gfp: gfp mask
+*
+* Returns 0 on success, -ENOMEM on failure
+*/
+#define genradix_prealloc(_radix, _nr, _gfp) \
+__genradix_prealloc(&(_radix)->tree, \
+__genradix_idx_to_offset(_radix, _nr + 1),\
+_gfp)
#endif /* _LINUX_GENERIC_RADIX_TREE_H */
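Putting the documented operations together, typical use of this header in kernel code looks roughly like the sketch below. struct foo, fill_and_walk() and the pr_debug() call are illustrative, not part of the header.

struct foo {
	size_t idx;
};

static GENRADIX(struct foo) foo_genradix;

static int fill_and_walk(size_t nr, gfp_t gfp)
{
	struct genradix_iter iter;
	struct foo *p;
	size_t i;

	genradix_init(&foo_genradix);

	for (i = 0; i < nr; i++) {
		p = genradix_ptr_alloc(&foo_genradix, i, gfp);
		if (!p)
			return -ENOMEM;
		p->idx = i;		/* entries start out zeroed */
	}

	genradix_for_each(&foo_genradix, iter, p)
		pr_debug("entry at index %zu\n", iter.pos);

	genradix_free(&foo_genradix);	/* frees and reinitializes */
	return 0;
}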


@@ -249,6 +249,9 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
+for_each_member_device(ca, c, i)
+bch2_dev_usage_from_buckets(c, ca);
mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
@@ -280,35 +283,51 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
#endif
struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k);
struct bucket *g;
-struct bucket_mark m;
+struct bucket_mark m, new;
int ret;
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
a->k.p = POS(ca->dev_idx, b);
+bch2_btree_iter_set_pos(iter, a->k.p);
+ret = bch2_btree_iter_traverse(iter);
+if (ret)
+return ret;
percpu_down_read_preempt_disable(&c->mark_lock);
g = bucket(ca, b);
-m = bucket_cmpxchg(g, m, m.dirty = false);
+m = READ_ONCE(g->mark);
+if (!m.dirty) {
+percpu_up_read_preempt_enable(&c->mark_lock);
+return 0;
+}
__alloc_write_key(a, g, m);
percpu_up_read_preempt_enable(&c->mark_lock);
bch2_btree_iter_cond_resched(iter);
-bch2_btree_iter_set_pos(iter, a->k.p);
ret = bch2_btree_insert_at(c, NULL, journal_seq,
+BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
+if (ret)
+return ret;
-if (!ret && ca->buckets_written)
+new = m;
+new.dirty = false;
+atomic64_cmpxchg(&g->_mark.v, m.v.counter, new.v.counter);
+if (ca->buckets_written)
set_bit(b, ca->buckets_written);
-return ret;
+return 0;
}
int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
@@ -898,10 +917,19 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
for (i = 0; i < RESERVE_NR; i++)
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
closure_wake_up(&c->freelist_wait);
+ca->allocator_blocked_full = false;
spin_unlock(&c->freelist_lock);
goto out;
}
+if (!ca->allocator_blocked_full) {
+ca->allocator_blocked_full = true;
+closure_wake_up(&c->freelist_wait);
+}
spin_unlock(&c->freelist_lock);
if ((current->flags & PF_KTHREAD) &&
@@ -1226,6 +1254,11 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
+{
+closure_wait_event(&c->freelist_wait, ca->allocator_blocked_full);
+}
/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
@@ -1333,6 +1366,24 @@ static void allocator_start_issue_discards(struct bch_fs *c)
ca->mi.bucket_size, GFP_NOIO, 0);
}
+static int resize_free_inc(struct bch_dev *ca)
+{
+alloc_fifo free_inc;
+if (!fifo_full(&ca->free_inc))
+return 0;
+if (!init_fifo(&free_inc,
+ca->free_inc.size * 2,
+GFP_KERNEL))
+return -ENOMEM;
+fifo_move(&free_inc, &ca->free_inc);
+swap(free_inc, ca->free_inc);
+free_fifo(&free_inc);
+return 0;
+}
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -1408,6 +1459,12 @@ not_enough:
while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
(bu = next_alloc_bucket(ca)) >= 0) {
+ret = resize_free_inc(ca);
+if (ret) {
+percpu_ref_put(&ca->io_ref);
+return ret;
+}
bch2_invalidate_one_bucket(c, ca, bu,
&journal_seq);


@@ -51,6 +51,7 @@ void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);


@@ -106,6 +106,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr),
false, gc_pos_alloc(c, ob), 0);
ob->valid = false;
+ob->type = 0;
spin_unlock(&ob->lock);
percpu_up_read_preempt_enable(&c->mark_lock);
@@ -141,6 +142,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
ob = c->open_buckets + c->open_buckets_freelist;
c->open_buckets_freelist = ob->freelist;
atomic_set(&ob->pin, 1);
+ob->type = 0;
c->open_buckets_nr_free--;
return ob;
@@ -209,9 +211,9 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
case RESERVE_ALLOC:
return 0;
case RESERVE_BTREE:
-return BTREE_NODE_RESERVE / 2;
+return BTREE_NODE_OPEN_BUCKET_RESERVE;
default:
-return BTREE_NODE_RESERVE;
+return BTREE_NODE_OPEN_BUCKET_RESERVE * 2;
}
}
@@ -837,15 +839,17 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
{
struct write_point *wp;
struct open_bucket *ob;
-unsigned nr_effective = 0;
+struct open_buckets ptrs;
-struct open_buckets ptrs = { .nr = 0 };
+unsigned nr_effective, write_points_nr;
-bool have_cache = false;
+bool have_cache;
-unsigned write_points_nr;
+int ret, i;
-int ret = 0, i;
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
+ptrs.nr = 0;
+nr_effective = 0;
write_points_nr = c->write_points_nr;
+have_cache = false;
wp = writepoint_find(c, write_point.v);


@@ -85,6 +85,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ob->type = wp->type;
atomic_inc(&ob->pin);
ob_push(c, ptrs, ob);
}


@@ -55,9 +55,10 @@ struct open_bucket {
spinlock_t lock;
atomic_t pin;
u8 freelist;
-bool valid;
-bool on_partial_list;
u8 ec_idx;
+u8 type;
+unsigned valid:1;
+unsigned on_partial_list:1;
unsigned sectors_free;
struct bch_extent_ptr ptr;
struct ec_stripe_new *ec;


@@ -330,6 +330,8 @@ enum bch_time_stats {
/* Size of the freelist we allocate btree nodes from: */
#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
struct btree;
enum gc_phase {
@@ -426,7 +428,13 @@ struct bch_dev {
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
+/*
+* XXX: this should be an enum for allocator state, so as to include
+* error state
+*/
bool allocator_blocked;
+bool allocator_blocked_full;
alloc_heap alloc_heap;
@@ -597,6 +605,7 @@ struct bch_fs {
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
+struct workqueue_struct *journal_reclaim_wq;
/* ALLOCATION */
struct delayed_work pd_controllers_update;


@@ -1010,11 +1010,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
nr_key_bits -= 64;
}
-if (l_v != r_v)
+if (!nr_key_bits || l_v != r_v)
-return l_v < r_v ? -1 : 1;
+break;
-if (!nr_key_bits)
-return 0;
l = next_word(l);
r = next_word(r);
@@ -1022,6 +1019,8 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
l_v = *l;
r_v = *r;
}
+return (l_v > r_v) - (l_v < r_v);
}
#endif


@@ -483,31 +483,6 @@ static void bch2_gc_free(struct bch_fs *c)
percpu_up_write(&c->mark_lock);
}
-/*
-* Accumulate percpu counters onto one cpu's copy - only valid when access
-* against any percpu counter is guarded against
-*/
-static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
-u64 *ret;
-int cpu;
-preempt_disable();
-ret = this_cpu_ptr(p);
-preempt_enable();
-for_each_possible_cpu(cpu) {
-u64 *i = per_cpu_ptr(p, cpu);
-if (i != ret) {
-acc_u64s(ret, i, nr);
-memset(i, 0, nr * sizeof(u64));
-}
-}
-return ret;
-}
static void bch2_gc_done_nocheck(struct bch_fs *c)
{
struct bch_dev *ca;
@@ -543,9 +518,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
for_each_member_device(ca, c, i) {
unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
struct bch_dev_usage *dst = (void *)
-acc_percpu_u64s((void *) ca->usage[0], nr);
+bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
struct bch_dev_usage *src = (void *)
-acc_percpu_u64s((void *) ca->usage[1], nr);
+bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
*dst = *src;
}
@@ -554,9 +529,9 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
c->replicas.nr;
struct bch_fs_usage *dst = (void *)
-acc_percpu_u64s((void *) c->usage[0], nr);
+bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
-acc_percpu_u64s((void *) c->usage[1], nr);
+bch2_acc_percpu_u64s((void *) c->usage[1], nr);
memcpy(&dst->s.gc_start[0],
&src->s.gc_start[0],
@@ -582,6 +557,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
dst_iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
+dst->dirty = true; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
@@ -612,16 +588,18 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
(src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
+BUG_ON(src_iter.pos != dst_iter.pos);
copy_stripe_field(alive, "alive");
copy_stripe_field(sectors, "sectors");
copy_stripe_field(algorithm, "algorithm");
copy_stripe_field(nr_blocks, "nr_blocks");
copy_stripe_field(nr_redundant, "nr_redundant");
-copy_stripe_field(blocks_nonempty.counter,
+copy_stripe_field(blocks_nonempty,
"blocks_nonempty");
for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
-copy_stripe_field(block_sectors[i].counter,
+copy_stripe_field(block_sectors[i],
"block_sectors[%u]", i);
if (dst->alive)
@@ -656,9 +634,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
for_each_member_device(ca, c, i) {
unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
struct bch_dev_usage *dst = (void *)
-acc_percpu_u64s((void *) ca->usage[0], nr);
+bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
struct bch_dev_usage *src = (void *)
-acc_percpu_u64s((void *) ca->usage[1], nr);
+bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
@@ -678,9 +656,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
c->replicas.nr;
struct bch_fs_usage *dst = (void *)
-acc_percpu_u64s((void *) c->usage[0], nr);
+bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
-acc_percpu_u64s((void *) c->usage[1], nr);
+bch2_acc_percpu_u64s((void *) c->usage[1], nr);
copy_fs_field(s.hidden, "hidden");
copy_fs_field(s.data, "data");


@@ -109,7 +109,7 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
-ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
+ret = gc_pos_cmp(pos, c->gc_pos) < 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;


@@ -77,6 +77,7 @@ enum {
__BTREE_INSERT_ATOMIC,
__BTREE_INSERT_NOUNLOCK,
__BTREE_INSERT_NOFAIL,
+__BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
@@ -100,6 +101,8 @@ enum {
/* Don't check for -ENOSPC: */
#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
+#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW)
/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)


@@ -628,7 +628,8 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
trans_for_each_entry(trans, i)
btree_insert_entry_checks(c, i);
-if (unlikely(!percpu_ref_tryget(&c->writes)))
+if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
+!percpu_ref_tryget(&c->writes)))
return -EROFS;
retry:
trans_for_each_iter(trans, i) {
@@ -658,7 +659,8 @@ retry:
trans_for_each_iter(trans, i)
bch2_btree_iter_downgrade(i->iter);
out:
-percpu_ref_put(&c->writes);
+if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
+percpu_ref_put(&c->writes);
/* make sure we didn't drop or screw up locks: */
trans_for_each_iter(trans, i) {


@@ -151,7 +151,6 @@ retry:
acc_u64s_percpu((u64 *) ret,
(u64 __percpu *) c->usage[0],
sizeof(*ret) / sizeof(u64) + nr);
-percpu_up_read_preempt_enable(&c->mark_lock);
return ret;
}
@@ -223,13 +222,14 @@ static bool bucket_became_unavailable(struct bucket_mark old,
!is_available_bucket(new);
}
-void bch2_fs_usage_apply(struct bch_fs *c,
+int bch2_fs_usage_apply(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct disk_reservation *disk_res,
struct gc_pos gc_pos)
{
s64 added = fs_usage->s.data + fs_usage->s.reserved;
s64 should_not_have_added;
+int ret = 0;
percpu_rwsem_assert_held(&c->mark_lock);
@@ -242,6 +242,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
"disk usage increased without a reservation")) {
atomic64_sub(should_not_have_added, &c->sectors_available);
added -= should_not_have_added;
+ret = -1;
}
if (added > 0) {
@@ -259,6 +260,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
(u64 *) fs_usage,
sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
}
+return ret;
}
static inline void account_bucket(struct bch_fs_usage *fs_usage,
@@ -363,10 +366,7 @@ static inline void update_cached_sectors(struct bch_fs *c,
{
struct bch_replicas_padded r;
-r.e.data_type = BCH_DATA_CACHED;
+bch2_replicas_entry_cached(&r.e, dev);
-r.e.nr_devs = 1;
-r.e.nr_required = 1;
-r.e.devs[0] = dev;
update_replicas(c, fs_usage, &r.e, sectors);
}
@@ -382,7 +382,8 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
*old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
BUG_ON(!is_available_bucket(new));
-new.owned_by_allocator = 1;
+new.owned_by_allocator = true;
+new.dirty = true;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
@@ -455,6 +456,7 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
type != BCH_DATA_JOURNAL);
bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+new.dirty = true;
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
@@ -480,13 +482,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
true);
} else {
struct bucket *g;
-struct bucket_mark old, new;
+struct bucket_mark new;
rcu_read_lock();
g = bucket(ca, b);
-old = bucket_cmpxchg(g, new, ({
+bucket_cmpxchg(g, new, ({
-new.data_type = type;
+new.dirty = true;
+new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
@@ -537,6 +540,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
do {
new.v.counter = old.v.counter = v;
+new.dirty = true;
/*
* Check this after reading bucket mark to guard against
* the allocator invalidating a bucket after we've already
@@ -591,9 +596,14 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
int blocks_nonempty_delta;
s64 parity_sectors;
+BUG_ON(!sectors);
m = genradix_ptr(&c->stripes[gc], p.idx);
+spin_lock(&c->ec_stripes_heap_lock);
if (!m || !m->alive) {
+spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
(u64) p.idx);
return -1;
@@ -609,19 +619,21 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
parity_sectors = -parity_sectors;
sectors += parity_sectors;
-new = atomic_add_return(sectors, &m->block_sectors[p.block]);
+old = m->block_sectors[p.block];
-old = new - sectors;
+m->block_sectors[p.block] += sectors;
+new = m->block_sectors[p.block];
blocks_nonempty_delta = (int) !!new - (int) !!old;
-if (!blocks_nonempty_delta)
+if (blocks_nonempty_delta) {
-return 0;
+m->blocks_nonempty += blocks_nonempty_delta;
-atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
+if (!gc)
+bch2_stripes_heap_update(c, m, p.idx);
+}
-BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
+m->dirty = true;
-if (!gc)
+spin_unlock(&c->ec_stripes_heap_lock);
-bch2_stripes_heap_update(c, m, p.idx);
update_replicas(c, fs_usage, &m->r.e, sectors);
@@ -629,8 +641,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
}
static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
-s64 sectors,
+s64 sectors, enum bch_data_type data_type,
-enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
unsigned journal_seq, unsigned flags,
bool gc)
@@ -701,14 +712,13 @@ static void bucket_set_stripe(struct bch_fs *c,
BUG_ON(ptr_stale(ca, ptr));
old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+new.dirty = true;
new.stripe = enabled;
if (journal_seq) {
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
}));
-BUG_ON(old.stripe == enabled);
}
}
@@ -723,22 +733,19 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
unsigned i;
+spin_lock(&c->ec_stripes_heap_lock);
if (!m || (!inserting && !m->alive)) {
+spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
return -1;
}
-if (inserting && m->alive) {
+if (m->alive)
-bch_err_ratelimited(c, "error marking stripe %zu: already exists",
+bch2_stripes_heap_del(c, m, idx);
-idx);
-return -1;
-}
-BUG_ON(atomic_read(&m->blocks_nonempty));
+memset(m, 0, sizeof(*m));
-for (i = 0; i < EC_STRIPE_MAX; i++)
-BUG_ON(atomic_read(&m->block_sectors[i]));
if (inserting) {
m->sectors = le16_to_cpu(s.v->sectors);
@@ -754,7 +761,6 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
for (i = 0; i < s.v->nr_blocks; i++)
m->r.e.devs[i] = s.v->ptrs[i].dev;
-}
/*
* XXX: account for stripes somehow here
@@ -763,15 +769,23 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
#endif
-if (!gc) {
+/* gc recalculates these fields: */
-if (inserting)
+if (!(flags & BCH_BUCKET_MARK_GC)) {
+for (i = 0; i < s.v->nr_blocks; i++) {
+m->block_sectors[i] =
+stripe_blockcount_get(s.v, i);
+m->blocks_nonempty += !!m->block_sectors[i];
+}
+}
+if (!gc)
bch2_stripes_heap_insert(c, m, idx);
else
-bch2_stripes_heap_del(c, m, idx);
+m->alive = true;
-} else {
-m->alive = inserting;
}
+spin_unlock(&c->ec_stripes_heap_lock);
bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
return 0;
}
@@ -879,6 +893,8 @@ void bch2_mark_update(struct btree_insert *trans,
struct bch_fs_usage *fs_usage;
struct gc_pos pos = gc_pos_btree_node(b);
struct bkey_packed *_k;
+u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+static int warned_disk_usage = 0;
if (!btree_node_type_needs_gc(iter->btree_id))
return;
@@ -939,7 +955,37 @@ void bch2_mark_update(struct btree_insert *trans,
bch2_btree_node_iter_advance(&node_iter, b);
}
-bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos);
+if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+!warned_disk_usage &&
+!xchg(&warned_disk_usage, 1)) {
+char buf[200];
+pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+pr_err("while inserting");
+bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
+pr_err("%s", buf);
+pr_err("overlapping with");
+node_iter = iter->l[0].iter;
+while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+KEY_TYPE_discard))) {
+struct bkey unpacked;
+struct bkey_s_c k;
+k = bkey_disassemble(b, _k, &unpacked);
+if (btree_node_is_extents(b)
+? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
+: bkey_cmp(insert->k->k.p, k.k->p))
+break;
+bch2_bkey_val_to_text(&PBUF(buf), c, k);
+pr_err("%s", buf);
+bch2_btree_node_iter_advance(&node_iter, b);
+}
+}
percpu_up_read_preempt_enable(&c->mark_lock);
}


@@ -181,6 +181,8 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
+void bch2_dev_usage_from_buckets(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
@@ -264,8 +266,8 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
-void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
/* disk reservations: */


@@ -402,6 +402,8 @@ static long bch2_ioctl_usage(struct bch_fs *c,
if (!src)
return -ENOMEM;
+percpu_up_read_preempt_enable(&c->mark_lock);
dst.used = bch2_fs_sectors_used(c, *src);
dst.online_reserved = src->s.online_reserved;


@@ -11,6 +11,7 @@
#include "ec.h"
#include "error.h"
#include "io.h"
+#include "journal_io.h"
#include "keylist.h"
#include "super-io.h"
#include "util.h"
@@ -98,40 +99,6 @@ struct ec_bio {
/* Stripes btree keys: */
-static unsigned stripe_csums_per_device(const struct bch_stripe *s)
-{
-return DIV_ROUND_UP(le16_to_cpu(s->sectors),
-1 << s->csum_granularity_bits);
-}
-static unsigned stripe_csum_offset(const struct bch_stripe *s,
-unsigned dev, unsigned csum_idx)
-{
-unsigned csum_bytes = bch_crc_bytes[s->csum_type];
-return sizeof(struct bch_stripe) +
-sizeof(struct bch_extent_ptr) * s->nr_blocks +
-(dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
-}
-static unsigned stripe_blockcount_offset(const struct bch_stripe *s,
-unsigned idx)
-{
-return stripe_csum_offset(s, s->nr_blocks, 0) +
-sizeof(16) * idx;
-}
-static unsigned stripe_val_u64s(const struct bch_stripe *s)
-{
-return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
-sizeof(u64));
-}
-static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx)
-{
-return (void *) s + stripe_csum_offset(s, dev, csum_idx);
-}
const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
@@ -164,8 +131,9 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
1U << s->csum_granularity_bits);
for (i = 0; i < s->nr_blocks; i++)
-pr_buf(out, " %u:%llu", s->ptrs[i].dev,
+pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
-(u64) s->ptrs[i].offset);
+(u64) s->ptrs[i].offset,
+stripe_blockcount_get(s, i));
}
static int ptr_matches_stripe(struct bch_fs *c,
@@ -609,29 +577,15 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
BUG_ON(h->data[m->heap_idx].idx != idx);
}
-static inline unsigned stripe_entry_blocks(struct stripe *m)
-{
-return atomic_read(&m->blocks_nonempty);
-}
void bch2_stripes_heap_update(struct bch_fs *c,
struct stripe *m, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
-bool queue_delete;
size_t i;
-spin_lock(&c->ec_stripes_heap_lock);
-if (!m->alive) {
-spin_unlock(&c->ec_stripes_heap_lock);
-return;
-}
heap_verify_backpointer(c, idx);
-h->data[m->heap_idx].blocks_nonempty =
+h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-stripe_entry_blocks(m);
i = m->heap_idx;
heap_sift_up(h, i, ec_stripes_heap_cmp,
@@ -641,44 +595,35 @@ void bch2_stripes_heap_update(struct bch_fs *c,
heap_verify_backpointer(c, idx);
-queue_delete = stripe_idx_to_delete(c) >= 0;
+if (stripe_idx_to_delete(c) >= 0)
-spin_unlock(&c->ec_stripes_heap_lock);
-if (queue_delete)
schedule_work(&c->ec_stripe_delete_work);
}
void bch2_stripes_heap_del(struct bch_fs *c,
struct stripe *m, size_t idx)
{
-spin_lock(&c->ec_stripes_heap_lock);
heap_verify_backpointer(c, idx);
m->alive = false;
heap_del(&c->ec_stripes_heap, m->heap_idx,
ec_stripes_heap_cmp,
ec_stripes_heap_set_backpointer);
-spin_unlock(&c->ec_stripes_heap_lock);
}
void bch2_stripes_heap_insert(struct bch_fs *c,
struct stripe *m, size_t idx)
{
-spin_lock(&c->ec_stripes_heap_lock);
BUG_ON(heap_full(&c->ec_stripes_heap));
heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
.idx = idx,
-.blocks_nonempty = stripe_entry_blocks(m),
+.blocks_nonempty = m->blocks_nonempty,
}),
ec_stripes_heap_cmp,
ec_stripes_heap_set_backpointer);
m->alive = true;
heap_verify_backpointer(c, idx);
-spin_unlock(&c->ec_stripes_heap_lock);
}
/* stripe deletion */
@@ -1217,6 +1162,116 @@ unlock:
mutex_unlock(&c->ec_new_stripe_lock);
}
+static int __bch2_stripe_write_key(struct bch_fs *c,
+struct btree_iter *iter,
+struct stripe *m,
+size_t idx,
+struct bkey_i_stripe *new_key,
+unsigned flags)
+{
+struct bkey_s_c k;
+unsigned i;
+int ret;
+bch2_btree_iter_set_pos(iter, POS(0, idx));
+k = bch2_btree_iter_peek_slot(iter);
+ret = btree_iter_err(k);
+if (ret)
+return ret;
+if (k.k->type != KEY_TYPE_stripe)
+return -EIO;
+bkey_reassemble(&new_key->k_i, k);
+spin_lock(&c->ec_stripes_heap_lock);
+for (i = 0; i < new_key->v.nr_blocks; i++)
+stripe_blockcount_set(&new_key->v, i,
+m->block_sectors[i]);
+m->dirty = false;
+spin_unlock(&c->ec_stripes_heap_lock);
+return bch2_btree_insert_at(c, NULL, NULL,
+BTREE_INSERT_NOFAIL|flags,
+BTREE_INSERT_ENTRY(iter, &new_key->k_i));
+}
+int bch2_stripes_write(struct bch_fs *c, bool *wrote)
+{
+struct btree_iter iter;
+struct genradix_iter giter;
+struct bkey_i_stripe *new_key;
+struct stripe *m;
+int ret = 0;
+new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
+BUG_ON(!new_key);
+bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS_MIN,
+BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+genradix_for_each(&c->stripes[0], giter, m) {
+if (!m->dirty)
+continue;
+ret = __bch2_stripe_write_key(c, &iter, m, giter.pos,
+new_key, BTREE_INSERT_NOCHECK_RW);
+if (ret)
+break;
+*wrote = true;
+}
+bch2_btree_iter_unlock(&iter);
+kfree(new_key);
+return ret;
+}
+static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
+{
+struct gc_pos pos = { 0 };
+bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0);
+}
+int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)
+{
+struct journal_replay *r;
+struct btree_iter iter;
+struct bkey_s_c k;
+int ret;
+ret = bch2_fs_ec_start(c);
+if (ret)
+return ret;
+for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, 0, k) {
+bch2_stripe_read_key(c, k);
+bch2_btree_iter_cond_resched(&iter);
+}
+ret = bch2_btree_iter_unlock(&iter);
+if (ret)
+return ret;
+list_for_each_entry(r, journal_replay_list, list) {
+struct bkey_i *k, *n;
+struct jset_entry *entry;
+for_each_jset_key(k, n, entry, &r->j)
+if (entry->btree_id == BTREE_ID_EC)
+bch2_stripe_read_key(c, bkey_i_to_s_c(k));
+}
+return 0;
+}
int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
{
struct btree_iter iter;


@@ -13,6 +13,55 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
.val_to_text = bch2_stripe_to_text, \
}
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+1 << s->csum_granularity_bits);
+}
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
+unsigned dev, unsigned csum_idx)
+{
+unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+return sizeof(struct bch_stripe) +
+sizeof(struct bch_extent_ptr) * s->nr_blocks +
+(dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
+unsigned idx)
+{
+return stripe_csum_offset(s, s->nr_blocks, 0) +
+sizeof(u16) * idx;
+}
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
+unsigned idx)
+{
+return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
+}
+static inline void stripe_blockcount_set(struct bch_stripe *s,
+unsigned idx, unsigned v)
+{
+__le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
+*p = cpu_to_le16(v);
+}
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
+sizeof(u64));
+}
+static inline void *stripe_csum(struct bch_stripe *s,
+unsigned dev, unsigned csum_idx)
+{
+return (void *) s + stripe_csum_offset(s, dev, csum_idx);
+}
struct bch_read_bio;
struct ec_stripe_buf {
@@ -100,6 +149,9 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_ec_flush_new_stripes(struct bch_fs *);
+int bch2_stripes_read(struct bch_fs *, struct list_head *);
+int bch2_stripes_write(struct bch_fs *, bool *);
int bch2_ec_mem_alloc(struct bch_fs *, bool);
int bch2_fs_ec_start(struct bch_fs *);
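The inline helpers added above fix the layout of a stripe key's value: the bch_stripe header, then one extent pointer per block, then each block's checksums, then the per-block __le16 sector counts that this update starts persisting. A rough standalone sketch of that arithmetic, using made-up parameters and assumed struct sizes (the 32- and 8-byte figures are illustrative, not taken from the headers):

#include <stdio.h>

int main(void)
{
	/* Example stripe: 6 blocks (4 data + 2 parity), 256-sector blocks,
	 * crc32c checksums (4 bytes) at 64-sector granularity. */
	unsigned nr_blocks = 6, sectors = 256, gran = 64, csum_bytes = 4;
	unsigned csums_per_device = (sectors + gran - 1) / gran;	/* 4 */

	unsigned header   = 32;			/* assumed sizeof(struct bch_stripe) */
	unsigned ptrs     = nr_blocks * 8;	/* assumed sizeof(struct bch_extent_ptr) */
	unsigned csums    = nr_blocks * csums_per_device * csum_bytes;
	unsigned counts   = nr_blocks * 2;	/* the new __le16 block counts */
	unsigned val_u64s = (header + ptrs + csums + counts + 7) / 8;

	printf("stripe value: %u bytes, %u u64s\n",
	       header + ptrs + csums + counts, val_u64s);
	return 0;
}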


@@ -19,9 +19,10 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
-u8 alive;
+unsigned alive:1;
-atomic_t blocks_nonempty;
+unsigned dirty:1;
-atomic_t block_sectors[EC_STRIPE_MAX];
+u8 blocks_nonempty;
+u16 block_sectors[EC_STRIPE_MAX];
struct bch_replicas_padded r;
};


@@ -1664,12 +1664,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
return ret == BCH_MERGE_MERGE;
}
-int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+unsigned nr_replicas)
{
struct btree_iter iter;
struct bpos end = pos;
struct bkey_s_c k;
-int ret = 0;
+bool ret = true;
end.offset += size;
@@ -1678,8 +1679,8 @@
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
-if (!bch2_extent_is_fully_allocated(k)) {
+if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
-ret = -ENOSPC;
+ret = false;
break;
}
}
@@ -1688,6 +1689,29 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
return ret;
}
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+unsigned ret = 0;
+switch (k.k->type) {
+case KEY_TYPE_extent: {
+struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+const union bch_extent_entry *entry;
+struct extent_ptr_decoded p;
+extent_for_each_ptr_decode(e, p, entry)
+ret += !p.ptr.cached &&
+p.crc.compression_type == BCH_COMPRESSION_NONE;
+break;
+}
+case KEY_TYPE_reservation:
+ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+break;
+}
+return ret;
+}
/* KEY_TYPE_reservation: */
const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)


@@ -571,6 +571,7 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst,
BUG_ON(!bch2_bkey_pack_key(dst, src, f));
}
-int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
#endif /* _BCACHEFS_EXTENTS_H */


@@ -262,18 +262,20 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
}
}
-static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
-eytzinger_cmp_fn cmp, const void *search)
-{
-size_t i = 0;
-int res;
-while (i < nr &&
-(res = cmp(search, base + i * size, size)))
-i = eytzinger0_child(i, res > 0);
-return i;
-}
+#define eytzinger0_find(base, nr, size, _cmp, search) \
+({ \
+	void *_base = (base); \
+	void *_search = (search); \
+	size_t _nr = (nr); \
+	size_t _size = (size); \
+	size_t _i = 0; \
+	int _res; \
+	\
+	while (_i < _nr && \
+	       (_res = _cmp(_search, _base + _i * _size, _size))) \
+		_i = eytzinger0_child(_i, _res > 0); \
+	_i; \
+})
void eytzinger0_sort(void *, size_t, size_t,
int (*cmp_func)(const void *, const void *, size_t),
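The rewrite above turns eytzinger0_find() into a macro so the comparison callback can be inlined; callers still pass a memcmp-style three-argument comparator. A hedged usage sketch (u64_cmp, tree, nr and search are illustrative, and the array is assumed to already be laid out in eytzinger0 order):

static int u64_cmp(const void *_l, const void *_r, size_t size)
{
	const u64 *l = _l, *r = _r;

	return (*l > *r) - (*l < *r);
}

/*
 * size_t idx = eytzinger0_find(tree, nr, sizeof(u64), u64_cmp, &search);
 * if (idx < nr)
 *	... tree[idx] == search ...
 */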


@@ -253,7 +253,9 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
BUG_ON(btree_iter_err(old));
if (allocating &&
-!bch2_extent_is_fully_allocated(old))
+!*allocating &&
+bch2_bkey_nr_ptrs_allocated(old) <
+bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new)))
*allocating = true;
delta += (min(new->k.p.offset,
@@ -858,9 +860,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
{
struct bvec_iter iter;
struct bio_vec bv;
-unsigned nr_ptrs = !bch2_extent_is_compressed(k)
+unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
-? bch2_bkey_nr_dirty_ptrs(k)
-: 0;
bio_for_each_segment(bv, bio, iter) {
/* brand new pages, don't need to be locked: */
@@ -1759,6 +1759,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
struct bch_inode_info *inode = dio->iop.inode;
struct bio *bio = &dio->iop.op.wbio.bio;
struct bio_vec *bv;
+loff_t offset;
bool sync;
long ret;
int i;
@@ -1770,12 +1771,16 @@ static long bch2_dio_write_loop(struct dio_write *dio)
__pagecache_block_get(&mapping->add_lock);
/* Write and invalidate pagecache range that we're writing to: */
-ret = write_invalidate_inode_pages_range(mapping, req->ki_pos,
+offset = req->ki_pos + (dio->iop.op.written << 9);
-req->ki_pos + iov_iter_count(&dio->iter) - 1);
+ret = write_invalidate_inode_pages_range(mapping,
+offset,
+offset + iov_iter_count(&dio->iter) - 1);
if (unlikely(ret))
goto err;
while (1) {
+offset = req->ki_pos + (dio->iop.op.written << 9);
BUG_ON(current->pagecache_lock);
current->pagecache_lock = &mapping->add_lock;
if (kthread)
@@ -1792,13 +1797,12 @@ static long bch2_dio_write_loop(struct dio_write *dio)
/* gup might have faulted pages back in: */
ret = write_invalidate_inode_pages_range(mapping,
-req->ki_pos + (dio->iop.op.written << 9),
+offset,
-req->ki_pos + iov_iter_count(&dio->iter) - 1);
+offset + bio->bi_iter.bi_size - 1);
if (unlikely(ret))
goto err;
-dio->iop.op.pos = POS(inode->v.i_ino,
+dio->iop.op.pos = POS(inode->v.i_ino, offset >> 9);
-(req->ki_pos >> 9) + dio->iop.op.written);
task_io_account_write(bio->bi_iter.bi_size);
@@ -1878,7 +1882,6 @@ static int bch2_direct_IO_write(struct kiocb *req,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct dio_write *dio;
struct bio *bio;
-loff_t offset = req->ki_pos;
ssize_t ret;
lockdep_assert_held(&inode->v.i_rwsem);
@@ -1886,7 +1889,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
if (unlikely(!iter->count))
return 0;
-if (unlikely((offset|iter->count) & (block_bytes(c) - 1)))
+if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
return -EINVAL;
bio = bio_alloc_bioset(GFP_KERNEL,
@@ -1898,7 +1901,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
dio->mm = current->mm;
dio->loop = false;
dio->sync = is_sync_kiocb(req) ||
-offset + iter->count > inode->v.i_size;
+req->ki_pos + iter->count > inode->v.i_size;
dio->free_iov = false;
dio->quota_res.sectors = 0;
dio->iter = *iter;
@@ -1915,19 +1918,20 @@ static int bch2_direct_IO_write(struct kiocb *req,
if (unlikely(ret))
goto err;
+dio->iop.op.nr_replicas = dio->iop.op.opts.data_replicas;
ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
dio->iop.op.opts.data_replicas, 0);
if (unlikely(ret)) {
-if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
+if (!bch2_check_range_allocated(c, POS(inode->v.i_ino,
-offset >> 9),
+req->ki_pos >> 9),
-iter->count >> 9))
+iter->count >> 9,
+dio->iop.op.opts.data_replicas))
goto err;
dio->iop.unalloc = true;
}
-dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
return bch2_dio_write_loop(dio);
err:
bch2_disk_reservation_put(c, &dio->iop.op.res);


@@ -694,6 +694,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
}
list_for_each_entry(i, list, list) {
+struct bch_replicas_padded replicas;
+char buf[80];
+bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
@@ -705,11 +710,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
+fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
-i->devs, false), c,
+"superblock not marked as containing replicas %s",
-"superblock not marked as containing replicas (type %u)",
+(bch2_replicas_entry_to_text(&PBUF(buf),
-BCH_DATA_JOURNAL))) {
+&replicas.e), buf)))) {
-ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
+ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
return ret;
}
@@ -1108,6 +1113,7 @@ static void journal_write_done(struct closure *cl)
struct journal_buf *w = journal_prev_buf(j);
struct bch_devs_list devs =
bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+struct bch_replicas_padded replicas;
u64 seq = le64_to_cpu(w->data->seq);
u64 last_seq = le64_to_cpu(w->data->last_seq);
@@ -1118,7 +1124,9 @@ static void journal_write_done(struct closure *cl)
goto err;
}
-if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
+bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
+if (bch2_mark_replicas(c, &replicas.e))
goto err;
spin_lock(&j->lock);


@@ -335,7 +335,7 @@ void bch2_journal_reclaim_work(struct work_struct *work)
mutex_unlock(&j->reclaim_lock);
if (!test_bit(BCH_FS_RO, &c->flags))
-queue_delayed_work(system_freezable_wq, &j->reclaim_work,
+queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
msecs_to_jiffies(j->reclaim_delay_ms));
}
@@ -387,7 +387,6 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
-struct bch_devs_list devs;
u64 iter, seq = 0;
int ret = 0;
@@ -412,12 +411,15 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
spin_lock(&j->lock);
while (!ret && seq < j->pin.back) {
+struct bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
-devs = journal_seq_pin(j, seq)->devs;
+bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
+journal_seq_pin(j, seq)->devs);
seq++;
spin_unlock(&j->lock);
-ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+ret = bch2_mark_replicas(c, &replicas.e);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);


@ -4,6 +4,7 @@
#include "bcachefs.h" #include "bcachefs.h"
#include "btree_update.h" #include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h" #include "buckets.h"
#include "extents.h" #include "extents.h"
#include "io.h" #include "io.h"
@ -152,6 +153,16 @@ retry:
bch2_btree_iter_unlock(&iter); bch2_btree_iter_unlock(&iter);
} }
/* flush relevant btree updates */
while (1) {
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c) ||
c->btree_roots_dirty);
if (!bch2_btree_interior_updates_nr_pending(c))
break;
bch2_journal_meta(&c->journal);
}
ret = 0; ret = 0;
out: out:
ret = bch2_replicas_gc_end(c, ret); ret = bch2_replicas_gc_end(c, ret);
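This hunk and the matching one added to bch2_data_job() a little further down insert the same wait loop: block until either no btree interior updates are pending or a dirty btree root gives us a way to make progress, and in the latter case write a journal meta entry and re-check. A rough userspace analogue of that "wait, nudge forward progress, re-check" shape is sketched below; the predicates are placeholders so the sketch compiles on its own, not the bcachefs functions.

#include <stdbool.h>

/* placeholders: the real loop polls bch2_btree_interior_updates_nr_pending()
 * and c->btree_roots_dirty, sleeps in closure_wait_event() and kicks forward
 * progress with bch2_journal_meta() */
static bool interior_updates_pending(void) { return false; }
static bool roots_dirty(void)              { return false; }
static void wait_for_state_change(void)    { }
static void write_journal_meta(void)       { }

static void flush_btree_interior_updates(void)
{
	for (;;) {
		/* sleep until we are either done, or a dirty root means a
		 * journal meta write can push things forward */
		while (interior_updates_pending() && !roots_dirty())
			wait_for_state_change();

		if (!interior_updates_pending())
			break;

		write_journal_meta();
	}
}

int main(void)
{
	flush_btree_interior_updates();
	return 0;
}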


@ -3,6 +3,7 @@
#include "alloc_foreground.h" #include "alloc_foreground.h"
#include "btree_gc.h" #include "btree_gc.h"
#include "btree_update.h" #include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h" #include "buckets.h"
#include "disk_groups.h" #include "disk_groups.h"
#include "inode.h" #include "inode.h"
@ -763,6 +764,16 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_journal_flush_device_pins(&c->journal, -1); ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
while (1) {
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c) ||
c->btree_roots_dirty);
if (!bch2_btree_interior_updates_nr_pending(c))
break;
bch2_journal_meta(&c->journal);
}
ret = bch2_gc_btree_replicas(c) ?: ret; ret = bch2_gc_btree_replicas(c) ?: ret;
ret = bch2_move_data(c, NULL, ret = bch2_move_data(c, NULL,


@ -214,12 +214,12 @@ int bch2_fs_recovery(struct bch_fs *c)
if (ret) if (ret)
goto err; goto err;
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ret = bch2_stripes_read(c, &journal);
err = "cannot allocate memory";
ret = bch2_fs_ec_start(c);
if (ret) if (ret)
goto err; goto err;
pr_info("stripes_read done");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
bch_verbose(c, "starting mark and sweep:"); bch_verbose(c, "starting mark and sweep:");
err = "error in recovery"; err = "error in recovery";


@ -13,6 +13,16 @@ static inline int u8_cmp(u8 l, u8 r)
return (l > r) - (l < r); return (l > r) - (l < r);
} }
static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned i;
for (i = 0; i + 1 < e->nr_devs; i++)
BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}
static void replicas_entry_sort(struct bch_replicas_entry *e) static void replicas_entry_sort(struct bch_replicas_entry *e)
{ {
bubble_sort(e->devs, e->nr_devs, u8_cmp); bubble_sort(e->devs, e->nr_devs, u8_cmp);
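verify_replicas_entry_sorted() checks the invariant that the rest of this file now leans on: the device list inside every entry is sorted (replicas_entry_sort() above, now also called at the end of bkey_to_replicas() and bch2_devlist_to_replicas()), so whole entries can be ordered and matched with plain memcmp, which is what the eytzinger-sorted table and the new entry_cmp() search rely on. A tiny standalone illustration with a simplified entry struct rather than the real bch_replicas_entry:

#include <assert.h>
#include <string.h>

/* simplified stand-in for bch_replicas_entry */
struct entry { unsigned char data_type, nr_devs, nr_required, devs[4]; };

int main(void)
{
	struct entry a = { .data_type = 2, .nr_devs = 2, .nr_required = 1, .devs = { 1, 2 } };
	struct entry b = { .data_type = 2, .nr_devs = 2, .nr_required = 1, .devs = { 2, 1 } };

	/* same logical entry, but memcmp() only sees them as equal once both
	 * device lists are in the same (sorted) order */
	assert(memcmp(&a, &b, sizeof(a)) != 0);

	unsigned char t = b.devs[0]; b.devs[0] = b.devs[1]; b.devs[1] = t;	/* sort b */
	assert(memcmp(&a, &b, sizeof(a)) == 0);
	return 0;
}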
@ -23,19 +33,13 @@ static void replicas_entry_sort(struct bch_replicas_entry *e)
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size) _i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{ {
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
} }
static void replicas_entry_to_text(struct printbuf *out, void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry *e) struct bch_replicas_entry *e)
{ {
unsigned i; unsigned i;
@ -60,7 +64,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
pr_buf(out, " "); pr_buf(out, " ");
first = false; first = false;
replicas_entry_to_text(out, e); bch2_replicas_entry_to_text(out, e);
} }
} }
@ -100,8 +104,8 @@ static void stripe_to_replicas(struct bkey_s_c k,
r->devs[r->nr_devs++] = ptr->dev; r->devs[r->nr_devs++] = ptr->dev;
} }
static void bkey_to_replicas(struct bkey_s_c k, static void bkey_to_replicas(struct bch_replicas_entry *e,
struct bch_replicas_entry *e) struct bkey_s_c k)
{ {
e->nr_devs = 0; e->nr_devs = 0;
@ -119,11 +123,13 @@ static void bkey_to_replicas(struct bkey_s_c k,
stripe_to_replicas(k, e); stripe_to_replicas(k, e);
break; break;
} }
replicas_entry_sort(e);
} }
static inline void devlist_to_replicas(struct bch_devs_list devs, void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
enum bch_data_type data_type, enum bch_data_type data_type,
struct bch_replicas_entry *e) struct bch_devs_list devs)
{ {
unsigned i; unsigned i;
@ -137,6 +143,8 @@ static inline void devlist_to_replicas(struct bch_devs_list devs,
for (i = 0; i < devs.nr; i++) for (i = 0; i < devs.nr; i++)
e->devs[e->nr_devs++] = devs.devs[i]; e->devs[e->nr_devs++] = devs.devs[i];
replicas_entry_sort(e);
} }
static struct bch_replicas_cpu static struct bch_replicas_cpu
@ -150,6 +158,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
replicas_entry_bytes(new_entry)), replicas_entry_bytes(new_entry)),
}; };
BUG_ON(!new_entry->data_type);
verify_replicas_entry_sorted(new_entry);
new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
if (!new.entries) if (!new.entries)
return new; return new;
@ -175,13 +186,12 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
if (unlikely(entry_size > r->entry_size)) if (unlikely(entry_size > r->entry_size))
return -1; return -1;
replicas_entry_sort(search); verify_replicas_entry_sorted(search);
while (entry_size < r->entry_size)
((char *) search)[entry_size++] = 0;
#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size, idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
memcmp, search); entry_cmp, search);
#undef entry_cmp
return idx < r->nr ? idx : -1; return idx < r->nr ? idx : -1;
} }
@ -189,6 +199,8 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
int bch2_replicas_entry_idx(struct bch_fs *c, int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search) struct bch_replicas_entry *search)
{ {
replicas_entry_sort(search);
return __replicas_entry_idx(&c->replicas, search); return __replicas_entry_idx(&c->replicas, search);
} }
@ -198,12 +210,17 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
return __replicas_entry_idx(r, search) >= 0; return __replicas_entry_idx(r, search) >= 0;
} }
static bool replicas_has_entry(struct bch_fs *c, bool bch2_replicas_marked(struct bch_fs *c,
struct bch_replicas_entry *search, struct bch_replicas_entry *search,
bool check_gc_replicas) bool check_gc_replicas)
{ {
bool marked; bool marked;
if (!search->nr_devs)
return true;
verify_replicas_entry_sorted(search);
percpu_down_read_preempt_disable(&c->mark_lock); percpu_down_read_preempt_disable(&c->mark_lock);
marked = __replicas_has_entry(&c->replicas, search) && marked = __replicas_has_entry(&c->replicas, search) &&
(!check_gc_replicas || (!check_gc_replicas ||
@ -214,35 +231,31 @@ static bool replicas_has_entry(struct bch_fs *c,
return marked; return marked;
} }
static void __replicas_table_update(struct bch_fs_usage __percpu *dst, static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
struct bch_replicas_cpu *dst_r, struct bch_replicas_cpu *dst_r,
struct bch_fs_usage __percpu *src, struct bch_fs_usage __percpu *src_p,
struct bch_replicas_cpu *src_r) struct bch_replicas_cpu *src_r)
{ {
int src_idx, dst_idx, cpu; unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
struct bch_fs_usage *dst, *src = (void *)
bch2_acc_percpu_u64s((void *) src_p, src_nr);
int src_idx, dst_idx;
preempt_disable();
dst = this_cpu_ptr(dst_p);
preempt_enable();
*dst = *src;
for (src_idx = 0; src_idx < src_r->nr; src_idx++) { for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
u64 *dst_v, src_v = 0; if (!src->data[src_idx])
continue;
for_each_possible_cpu(cpu)
src_v += *per_cpu_ptr(&src->data[src_idx], cpu);
dst_idx = __replicas_entry_idx(dst_r, dst_idx = __replicas_entry_idx(dst_r,
cpu_replicas_entry(src_r, src_idx)); cpu_replicas_entry(src_r, src_idx));
BUG_ON(dst_idx < 0);
if (dst_idx < 0) { dst->data[dst_idx] = src->data[src_idx];
BUG_ON(src_v);
continue;
}
preempt_disable();
dst_v = this_cpu_ptr(&dst->data[dst_idx]);
BUG_ON(*dst_v);
*dst_v = src_v;
preempt_enable();
} }
} }
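The rewritten __replicas_table_update() first collapses the source percpu counters onto one CPU with bch2_acc_percpu_u64s() (added in util.c further down), then copies each nonzero per-entry counter into the destination table at whatever index the same replicas entry has there, with a BUG_ON if the entry is missing. Below is a standalone sketch of just that remapping step, with plain arrays standing in for the percpu usage tables and a naive lookup in place of __replicas_entry_idx().

#include <assert.h>

typedef unsigned long long u64;

/* toy lookup: entry "names" are single chars; bcachefs uses
 * __replicas_entry_idx() over the eytzinger-sorted table */
static int entry_idx(const char *table, int nr, char name)
{
	for (int i = 0; i < nr; i++)
		if (table[i] == name)
			return i;
	return -1;
}

int main(void)
{
	const char old_entries[] = { 'a', 'c' };
	const char new_entries[] = { 'a', 'b', 'c' };	/* 'b' was just added */
	u64 old_data[2] = { 10, 30 };
	u64 new_data[3] = { 0, 0, 0 };

	for (int i = 0; i < 2; i++) {
		if (!old_data[i])
			continue;
		int j = entry_idx(new_entries, 3, old_entries[i]);
		assert(j >= 0);		/* BUG_ON(dst_idx < 0) in the real code */
		new_data[j] = old_data[i];
	}

	assert(new_data[0] == 10 && new_data[1] == 0 && new_data[2] == 30);
	return 0;
}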
@ -344,30 +357,32 @@ err:
return ret; return ret;
} }
static int __bch2_mark_replicas(struct bch_fs *c, int bch2_mark_replicas(struct bch_fs *c,
struct bch_replicas_entry *devs) struct bch_replicas_entry *r)
{ {
return likely(replicas_has_entry(c, devs, true)) return likely(bch2_replicas_marked(c, r, true))
? 0 ? 0
: bch2_mark_replicas_slowpath(c, devs); : bch2_mark_replicas_slowpath(c, r);
} }
int bch2_mark_replicas(struct bch_fs *c, bool bch2_bkey_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type, struct bkey_s_c k,
struct bch_devs_list devs) bool check_gc_replicas)
{ {
struct bch_replicas_padded search; struct bch_replicas_padded search;
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
if (!devs.nr) for (i = 0; i < cached.nr; i++) {
return 0; bch2_replicas_entry_cached(&search.e, cached.devs[i]);
memset(&search, 0, sizeof(search)); if (!bch2_replicas_marked(c, &search.e, check_gc_replicas))
return false;
}
BUG_ON(devs.nr >= BCH_REPLICAS_MAX); bkey_to_replicas(&search.e, k);
devlist_to_replicas(devs, data_type, &search.e); return bch2_replicas_marked(c, &search.e, check_gc_replicas);
return __bch2_mark_replicas(c, &search.e);
} }
int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
@ -377,18 +392,17 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
unsigned i; unsigned i;
int ret; int ret;
memset(&search, 0, sizeof(search)); for (i = 0; i < cached.nr; i++) {
bch2_replicas_entry_cached(&search.e, cached.devs[i]);
for (i = 0; i < cached.nr; i++) ret = bch2_mark_replicas(c, &search.e);
if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED, if (ret)
bch2_dev_list_single(cached.devs[i]))))
return ret; return ret;
}
bkey_to_replicas(k, &search.e); bkey_to_replicas(&search.e, k);
return search.e.nr_devs return bch2_mark_replicas(c, &search.e);
? __bch2_mark_replicas(c, &search.e)
: 0;
} }
int bch2_replicas_gc_end(struct bch_fs *c, int ret) int bch2_replicas_gc_end(struct bch_fs *c, int ret)
@ -749,7 +763,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
pr_buf(out, " "); pr_buf(out, " ");
first = false; first = false;
replicas_entry_to_text(out, e); bch2_replicas_entry_to_text(out, e);
} }
} }
@ -798,46 +812,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
/* Query replicas: */ /* Query replicas: */
bool bch2_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs,
bool check_gc_replicas)
{
struct bch_replicas_padded search;
if (!devs.nr)
return true;
memset(&search, 0, sizeof(search));
devlist_to_replicas(devs, data_type, &search.e);
return replicas_has_entry(c, &search.e, check_gc_replicas);
}
bool bch2_bkey_replicas_marked(struct bch_fs *c,
struct bkey_s_c k,
bool check_gc_replicas)
{
struct bch_replicas_padded search;
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
memset(&search, 0, sizeof(search));
for (i = 0; i < cached.nr; i++)
if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
bch2_dev_list_single(cached.devs[i]),
check_gc_replicas))
return false;
bkey_to_replicas(k, &search.e);
return search.e.nr_devs
? replicas_has_entry(c, &search.e, check_gc_replicas)
: true;
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs) struct bch_devs_mask online_devs)
{ {


@ -4,17 +4,39 @@
#include "eytzinger.h" #include "eytzinger.h"
#include "replicas_types.h" #include "replicas_types.h"
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_replicas_entry_idx(struct bch_fs *, int bch2_replicas_entry_idx(struct bch_fs *,
struct bch_replicas_entry *); struct bch_replicas_entry *);
bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bch_devs_list, bool); void bch2_devlist_to_replicas(struct bch_replicas_entry *,
enum bch_data_type,
struct bch_devs_list);
bool bch2_replicas_marked(struct bch_fs *,
struct bch_replicas_entry *, bool);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry *);
bool bch2_bkey_replicas_marked(struct bch_fs *, bool bch2_bkey_replicas_marked(struct bch_fs *,
struct bkey_s_c, bool); struct bkey_s_c, bool);
int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
unsigned dev)
{
e->data_type = BCH_DATA_CACHED;
e->nr_devs = 1;
e->nr_required = 1;
e->devs[0] = dev;
}
struct replicas_status { struct replicas_status {
struct { struct {


@ -205,7 +205,9 @@ int bch2_congested(void *data, int bdi_bits)
static void __bch2_fs_read_only(struct bch_fs *c) static void __bch2_fs_read_only(struct bch_fs *c)
{ {
struct bch_dev *ca; struct bch_dev *ca;
bool wrote;
unsigned i; unsigned i;
int ret;
bch2_rebalance_stop(c); bch2_rebalance_stop(c);
@ -220,23 +222,42 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/ */
bch2_journal_flush_all_pins(&c->journal); bch2_journal_flush_all_pins(&c->journal);
do {
ret = bch2_alloc_write(c, false, &wrote);
if (ret) {
bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
break;
}
ret = bch2_stripes_write(c, &wrote);
if (ret) {
bch2_fs_inconsistent(c, "error writing out stripes");
break;
}
for_each_member_device(ca, c, i)
bch2_dev_allocator_quiesce(c, ca);
bch2_journal_flush_all_pins(&c->journal);
/*
* We need to explicitly wait on btree interior updates to complete
* before stopping the journal, flushing all journal pins isn't
* sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
* interior updates have to drop their journal pin before they're
* fully complete:
*/
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
} while (wrote);
for_each_member_device(ca, c, i) for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca); bch2_dev_allocator_stop(ca);
bch2_journal_flush_all_pins(&c->journal);
/*
* We need to explicitly wait on btree interior updates to complete
* before stopping the journal, flushing all journal pins isn't
* sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
* interior updates have to drop their journal pin before they're
* fully complete:
*/
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
bch2_fs_journal_stop(&c->journal); bch2_fs_journal_stop(&c->journal);
/* XXX: mark super that alloc info is persistent */
/* /*
* the journal kicks off btree writes via reclaim - wait for in flight * the journal kicks off btree writes via reclaim - wait for in flight
* writes after stopping journal: * writes after stopping journal:
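The new do/while in the read-only path is a fixed-point loop: writing out the alloc and stripe info goes through the btree itself, so one pass can dirty more of it, and the sequence write-out, quiesce allocators, flush journal pins, wait for interior updates is repeated until a pass reports that nothing was written. Only then are the allocator threads and the journal stopped. A compressed sketch of that shape, with placeholder helpers standing in for the bcachefs calls:

#include <stdbool.h>

/* placeholders for bch2_alloc_write(), bch2_stripes_write(),
 * bch2_dev_allocator_quiesce(), bch2_journal_flush_all_pins() and the
 * interior-update wait */
static int  write_alloc_info(bool *wrote)   { (void) wrote; return 0; }
static int  write_stripes(bool *wrote)      { (void) wrote; return 0; }
static void quiesce_allocators(void)        { }
static void flush_journal_pins(void)        { }
static void wait_for_interior_updates(void) { }

static void flush_to_fixed_point(void)
{
	bool wrote;

	do {
		wrote = false;		/* the real write-out helpers set this */

		if (write_alloc_info(&wrote))
			break;		/* error: report inconsistency and stop */
		if (write_stripes(&wrote))
			break;

		quiesce_allocators();
		flush_journal_pins();
		wait_for_interior_updates();
	} while (wrote);		/* repeat while a pass still wrote something */
}

int main(void)
{
	flush_to_fixed_point();
	return 0;
}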
@ -420,6 +441,8 @@ static void bch2_fs_free(struct bch_fs *c)
kfree(c->replicas_gc.entries); kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(rcu_dereference_protected(c->disk_groups, 1));
if (c->journal_reclaim_wq)
destroy_workqueue(c->journal_reclaim_wq);
if (c->copygc_wq) if (c->copygc_wq)
destroy_workqueue(c->copygc_wq); destroy_workqueue(c->copygc_wq);
if (c->wq) if (c->wq)
@ -638,6 +661,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcache_copygc", !(c->copygc_wq = alloc_workqueue("bcache_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
!(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) || percpu_ref_init(&c->writes, bch2_writes_disabled, 0, GFP_KERNEL) ||
mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
sizeof(struct btree_reserve)) || sizeof(struct btree_reserve)) ||
@ -1297,8 +1322,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (data) { if (data) {
char data_has_str[100]; char data_has_str[100];
bch2_string_opt_to_text(&PBUF(data_has_str), bch2_flags_to_text(&PBUF(data_has_str),
bch2_data_types, data); bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has_str); bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
ret = -EBUSY; ret = -EBUSY;
goto err; goto err;


@ -234,17 +234,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
{ {
struct printbuf out = _PBUF(buf, PAGE_SIZE); struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
unsigned replicas; unsigned i;
if (!fs_usage) if (!fs_usage)
return -ENOMEM; return -ENOMEM;
pr_buf(&out, "capacity:\t\t%llu\n", c->capacity); pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
for (replicas = 0; for (i = 0;
replicas < ARRAY_SIZE(fs_usage->persistent_reserved); i < ARRAY_SIZE(fs_usage->persistent_reserved);
replicas++) { i++) {
pr_buf(&out, "%u replicas:\n", replicas + 1); pr_buf(&out, "%u replicas:\n", i + 1);
#if 0 #if 0
for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++) for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
pr_buf(&out, "\t%s:\t\t%llu\n", pr_buf(&out, "\t%s:\t\t%llu\n",
@ -254,12 +254,23 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
stats.replicas[replicas].ec_data); stats.replicas[replicas].ec_data);
#endif #endif
pr_buf(&out, "\treserved:\t%llu\n", pr_buf(&out, "\treserved:\t%llu\n",
fs_usage->persistent_reserved[replicas]); fs_usage->persistent_reserved[i]);
} }
pr_buf(&out, "online reserved:\t%llu\n", pr_buf(&out, "online reserved:\t%llu\n",
fs_usage->s.online_reserved); fs_usage->s.online_reserved);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
pr_buf(&out, "\t");
bch2_replicas_entry_to_text(&out, e);
pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
}
percpu_up_read_preempt_enable(&c->mark_lock);
kfree(fs_usage); kfree(fs_usage);
return out.pos - buf; return out.pos - buf;
@ -797,6 +808,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
{ {
struct bch_fs *c = ca->fs; struct bch_fs *c = ca->fs;
struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
unsigned i, nr[BCH_DATA_NR];
memset(nr, 0, sizeof(nr));
for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].type]++;
return scnprintf(buf, PAGE_SIZE, return scnprintf(buf, PAGE_SIZE,
"free_inc: %zu/%zu\n" "free_inc: %zu/%zu\n"
@ -823,7 +840,10 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
" copygc threshold: %llu\n" " copygc threshold: %llu\n"
"freelist_wait: %s\n" "freelist_wait: %s\n"
"open buckets: %u/%u (reserved %u)\n" "open buckets: %u/%u (reserved %u)\n"
"open_buckets_wait: %s\n", "open_buckets_wait: %s\n"
"open_buckets_btree: %u\n"
"open_buckets_user: %u\n"
"btree reserve cache: %u\n",
fifo_used(&ca->free_inc), ca->free_inc.size, fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
@ -845,8 +865,12 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
stats.sectors_fragmented, stats.sectors_fragmented,
ca->copygc_threshold, ca->copygc_threshold,
c->freelist_wait.list.first ? "waiting" : "empty", c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
c->open_buckets_wait.list.first ? "waiting" : "empty"); BTREE_NODE_OPEN_BUCKET_RESERVE,
c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_BTREE],
nr[BCH_DATA_USER],
c->btree_reserve_cache_nr);
} }
static const char * const bch2_rw[] = { static const char * const bch2_rw[] = {


@ -133,6 +133,7 @@ void bch2_flags_to_text(struct printbuf *out,
const char * const list[], u64 flags) const char * const list[], u64 flags)
{ {
unsigned bit, nr = 0; unsigned bit, nr = 0;
bool first = true;
if (out->pos != out->end) if (out->pos != out->end)
*out->pos = '\0'; *out->pos = '\0';
@ -141,7 +142,10 @@ void bch2_flags_to_text(struct printbuf *out,
nr++; nr++;
while (flags && (bit = __ffs(flags)) < nr) { while (flags && (bit = __ffs(flags)) < nr) {
pr_buf(out, "%s,", list[bit]); pr_buf(out, "%s", list[bit]);
if (!first)
pr_buf(out, ",");
first = false;
flags ^= 1 << bit; flags ^= 1 << bit;
} }
} }
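The bch2_flags_to_text() change (together with the switch from bch2_string_opt_to_text() in the device-removal error path earlier, since `data` there is a bitmask) fixes the classic trailing-separator problem: emit the comma before every item except the first, instead of after every item. In isolation, with illustrative names:

#include <stdio.h>

/* print the names of the set bits in `flags`, comma separated, with no
 * trailing comma - mirrors the corrected bch2_flags_to_text() loop */
static void flags_to_str(char *buf, size_t len,
			 const char * const list[], unsigned nr, unsigned long flags)
{
	size_t pos = 0;
	int first = 1;

	for (unsigned bit = 0; bit < nr; bit++) {
		if (!(flags & (1UL << bit)))
			continue;
		pos += snprintf(buf + pos, pos < len ? len - pos : 0,
				"%s%s", first ? "" : ",", list[bit]);
		first = 0;
	}
}

int main(void)
{
	static const char * const types[] = { "sb", "journal", "btree", "user" };
	char buf[64];

	flags_to_str(buf, sizeof(buf), types, 4, (1 << 1) | (1 << 3));
	puts(buf);	/* "journal,user" - the old loop would have printed "journal,user," */
	return 0;
}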
@ -894,3 +898,28 @@ void eytzinger0_find_test(void)
kfree(test_array); kfree(test_array);
} }
#endif #endif
/*
* Accumulate percpu counters onto one cpu's copy - only valid when access
* against any percpu counter is guarded against
*/
u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
{
u64 *ret;
int cpu;
preempt_disable();
ret = this_cpu_ptr(p);
preempt_enable();
for_each_possible_cpu(cpu) {
u64 *i = per_cpu_ptr(p, cpu);
if (i != ret) {
acc_u64s(ret, i, nr);
memset(i, 0, nr * sizeof(u64));
}
}
return ret;
}
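bch2_acc_percpu_u64s() concentrates a set of percpu counters onto the current CPU's copy: every other CPU's array is added into it and then zeroed, so the totals are unchanged but can afterwards be read from a single copy (with the caller guaranteeing, as the comment says, that nothing else touches the counters meanwhile). A userspace illustration with plain arrays standing in for the percpu copies:

#include <assert.h>
#include <string.h>

typedef unsigned long long u64;

#define NR_CPUS		4
#define NR_COUNTERS	3

/* acc[cpu][i]: per-"cpu" copies of the same logical counters */
static u64 acc[NR_CPUS][NR_COUNTERS];

/* fold every copy into this_cpu's copy and zero the others */
static u64 *acc_percpu(unsigned this_cpu)
{
	u64 *ret = acc[this_cpu];

	for (unsigned cpu = 0; cpu < NR_CPUS; cpu++) {
		u64 *i = acc[cpu];

		if (i != ret) {
			for (unsigned n = 0; n < NR_COUNTERS; n++)
				ret[n] += i[n];
			memset(i, 0, sizeof(acc[cpu]));
		}
	}
	return ret;
}

int main(void)
{
	acc[0][0] = 1; acc[1][0] = 2; acc[3][0] = 4;
	acc[2][1] = 10;

	u64 *sum = acc_percpu(2);

	assert(sum[0] == 7 && sum[1] == 10 && sum[2] == 0);
	assert(acc[0][0] == 0 && acc[1][0] == 0 && acc[3][0] == 0);
	return 0;
}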


@ -715,4 +715,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
acc_u64s(acc, per_cpu_ptr(src, cpu), nr); acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
} }
u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
#endif /* _BCACHEFS_UTIL_H */ #endif /* _BCACHEFS_UTIL_H */


@ -1,4 +1,5 @@
#include <linux/atomic.h>
#include <linux/export.h> #include <linux/export.h>
#include <linux/generic-radix-tree.h> #include <linux/generic-radix-tree.h>
#include <linux/gfp.h> #include <linux/gfp.h>
@ -16,7 +17,7 @@ struct genradix_node {
}; };
}; };
static inline unsigned genradix_depth_shift(unsigned depth) static inline int genradix_depth_shift(unsigned depth)
{ {
return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth; return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
} }
@ -29,16 +30,34 @@ static inline size_t genradix_depth_size(unsigned depth)
return 1UL << genradix_depth_shift(depth); return 1UL << genradix_depth_shift(depth);
} }
/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH \
DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
unsigned genradix_root_to_depth(struct genradix_root *r)
{
return (unsigned long) r & GENRADIX_DEPTH_MASK;
}
struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}
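The new genradix_root is a tagged pointer: the root node is page-aligned, so its low bits are always zero and the tree depth can be stashed there, letting readers fetch node-plus-depth in one READ_ONCE() instead of reading the separate root and depth fields the old code used. Assuming 4K pages and 8-byte pointers (an assumption for the arithmetic, not something the header fixes), GENRADIX_ARY_SHIFT is 9, GENRADIX_MAX_DEPTH is DIV_ROUND_UP(64 - 12, 9) = 6, and GENRADIX_DEPTH_MASK is roundup_pow_of_two(7) - 1 = 7, i.e. the three low bits. A standalone sketch of the pack/unpack:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define DEPTH_MASK	((uintptr_t) 7)	/* GENRADIX_DEPTH_MASK for 64-bit, 4K pages */

struct node;			/* stand-in for struct genradix_node */
typedef struct root root_t;	/* opaque, like struct genradix_root */

static root_t *pack_root(struct node *n, unsigned depth)
{
	return (root_t *) ((uintptr_t) n | depth);
}

static struct node *root_to_node(root_t *r)
{
	return (struct node *) ((uintptr_t) r & ~DEPTH_MASK);
}

static unsigned root_to_depth(root_t *r)
{
	return (uintptr_t) r & DEPTH_MASK;
}

int main(void)
{
	/* page-aligned allocation, like __get_free_page() in the kernel */
	struct node *n = aligned_alloc(4096, 4096);
	root_t *r = pack_root(n, 3);

	assert(root_to_node(r) == n);
	assert(root_to_depth(r) == 3);
	free(n);
	return 0;
}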
/* /*
* Returns pointer to the specified byte @offset within @radix, or NULL if not * Returns pointer to the specified byte @offset within @radix, or NULL if not
* allocated * allocated
*/ */
void *__genradix_ptr(struct __genradix *radix, size_t offset) void *__genradix_ptr(struct __genradix *radix, size_t offset)
{ {
size_t level = radix->depth; struct genradix_root *r = READ_ONCE(radix->root);
struct genradix_node *n = radix->root; struct genradix_node *n = genradix_root_to_node(r);
unsigned level = genradix_root_to_depth(r);
if (offset >= genradix_depth_size(radix->depth)) if (ilog2(offset) >= genradix_depth_shift(level))
return NULL; return NULL;
while (1) { while (1) {
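A quick sanity check of the new bounds test, again assuming 4K pages and 8-byte pointers: genradix_depth_shift(level) is 12 + 9*level, so a depth-0 tree addresses byte offsets below 2^12, depth 1 below 2^21, depth 2 below 2^30, and the maximum depth of 6 reaches a shift past 64 bits. Comparing ilog2(offset) against that shift answers "does this offset fit in the current tree?" without ever evaluating genradix_depth_size() at a depth whose size would overflow an unsigned long; the change of genradix_depth_shift() to return int above presumably also keeps this comparison signed, since ilog2() is an int expression.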
@ -64,43 +83,60 @@ EXPORT_SYMBOL(__genradix_ptr);
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
gfp_t gfp_mask) gfp_t gfp_mask)
{ {
struct genradix_node **n; struct genradix_root *v = READ_ONCE(radix->root);
size_t level; struct genradix_node *n, *new_node = NULL;
unsigned level;
/* Increase tree depth if necessary: */ /* Increase tree depth if necessary: */
while (offset >= genradix_depth_size(radix->depth)) {
struct genradix_node *new_root =
(void *) __get_free_page(gfp_mask|__GFP_ZERO);
if (!new_root)
return NULL;
new_root->children[0] = radix->root;
radix->root = new_root;
radix->depth++;
}
n = &radix->root;
level = radix->depth;
while (1) { while (1) {
if (!*n) { struct genradix_root *r = v, *new_root;
*n = (void *) __get_free_page(gfp_mask|__GFP_ZERO);
if (!*n) n = genradix_root_to_node(r);
level = genradix_root_to_depth(r);
if (n && ilog2(offset) < genradix_depth_shift(level))
break;
if (!new_node) {
new_node = (void *)
__get_free_page(gfp_mask|__GFP_ZERO);
if (!new_node)
return NULL; return NULL;
} }
if (!level) new_node->children[0] = n;
break; new_root = ((struct genradix_root *)
((unsigned long) new_node | (n ? level + 1 : 0)));
level--; if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
v = new_root;
n = &(*n)->children[offset >> genradix_depth_shift(level)]; new_node = NULL;
offset &= genradix_depth_size(level) - 1; }
} }
return &(*n)->data[offset]; while (level--) {
struct genradix_node **p =
&n->children[offset >> genradix_depth_shift(level)];
offset &= genradix_depth_size(level) - 1;
n = READ_ONCE(*p);
if (!n) {
if (!new_node) {
new_node = (void *)
__get_free_page(gfp_mask|__GFP_ZERO);
if (!new_node)
return NULL;
}
if (!(n = cmpxchg_release(p, NULL, new_node)))
swap(n, new_node);
}
}
if (new_node)
free_page((unsigned long) new_node);
return &n->data[offset];
} }
EXPORT_SYMBOL(__genradix_ptr_alloc); EXPORT_SYMBOL(__genradix_ptr_alloc);
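__genradix_ptr_alloc() now grows the tree without a lock: it allocates a candidate page up front, tries to install it (as the new root, or into an empty child slot) with cmpxchg_release(), and if another thread got there first it keeps walking with the winner's node and frees its spare page at the end. The release ordering means a node is only published after its zeroed contents are visible to readers doing plain loads. Below is a self-contained sketch of that install-or-lose pattern for a single slot, written with GCC/Clang __atomic builtins; the names are illustrative, not the bcachefs ones.

#include <assert.h>
#include <stdlib.h>

struct node { int data[16]; };

static struct node *slot;	/* the shared pointer being filled in lazily */

/*
 * Install-or-lose: make sure *slotp points at an initialized node, allocating
 * one only if nobody beat us to it. Returns the node that ended up installed.
 */
static struct node *get_or_alloc(struct node **slotp)
{
	struct node *n = __atomic_load_n(slotp, __ATOMIC_ACQUIRE);
	struct node *new_node, *expected = NULL;

	if (n)
		return n;

	new_node = calloc(1, sizeof(*new_node));
	if (!new_node)
		return NULL;

	/* publish with release ordering so the zeroed contents are visible
	 * before the pointer is; expected == NULL means we won the race */
	if (__atomic_compare_exchange_n(slotp, &expected, new_node, false,
					__ATOMIC_RELEASE, __ATOMIC_ACQUIRE))
		return new_node;

	/* someone else installed a node first: use theirs, drop ours */
	free(new_node);
	return expected;
}

int main(void)
{
	/* single-threaded demo of the API; the interesting case is two
	 * threads racing, where exactly one calloc() survives */
	struct node *a = get_or_alloc(&slot);
	struct node *b = get_or_alloc(&slot);

	assert(a && a == b);
	free(slot);
	return 0;
}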
@ -108,17 +144,19 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
struct __genradix *radix, struct __genradix *radix,
size_t objs_per_page) size_t objs_per_page)
{ {
struct genradix_root *r;
struct genradix_node *n; struct genradix_node *n;
size_t level, i; unsigned level, i;
if (!radix->root)
return NULL;
restart: restart:
if (iter->offset >= genradix_depth_size(radix->depth)) r = READ_ONCE(radix->root);
if (!r)
return NULL; return NULL;
n = radix->root; n = genradix_root_to_node(r);
level = radix->depth; level = genradix_root_to_depth(r);
if (ilog2(iter->offset) >= genradix_depth_shift(level))
return NULL;
while (level) { while (level) {
level--; level--;
@ -157,11 +195,24 @@ static void genradix_free_recurse(struct genradix_node *n, unsigned level)
free_page((unsigned long) n); free_page((unsigned long) n);
} }
int __genradix_prealloc(struct __genradix *radix, size_t size,
gfp_t gfp_mask)
{
size_t offset;
for (offset = 0; offset < size; offset += PAGE_SIZE)
if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
return -ENOMEM;
return 0;
}
EXPORT_SYMBOL(__genradix_prealloc);
void __genradix_free(struct __genradix *radix) void __genradix_free(struct __genradix *radix)
{ {
genradix_free_recurse(radix->root, radix->depth); struct genradix_root *r = xchg(&radix->root, NULL);
radix->root = NULL; genradix_free_recurse(genradix_root_to_node(r),
radix->depth = 0; genradix_root_to_depth(r));
} }
EXPORT_SYMBOL(__genradix_free); EXPORT_SYMBOL(__genradix_free);