Update bcachefs sources to 6a25f7a00d bcachefs: fix ioctl code

Kent Overstreet 2017-06-13 17:06:05 -08:00
parent 914c4d19ed
commit 38f22164a9
49 changed files with 2927 additions and 2806 deletions

View File

@@ -1 +1 @@
-14e9ac5016803fc63c1216608c866bef16b4053e
+6a25f7a00d08c45b35bed3d649c05286ec60f7f6

View File

@@ -69,7 +69,8 @@ SRCS=bcachefs.c \
 libbcachefs/btree_gc.c \
 libbcachefs/btree_io.c \
 libbcachefs/btree_iter.c \
-libbcachefs/btree_update.c \
+libbcachefs/btree_update_interior.c \
+libbcachefs/btree_update_leaf.c \
 libbcachefs/buckets.c \
 libbcachefs/checksum.c \
 libbcachefs/clock.c \

View File

@@ -24,6 +24,7 @@
 #include <linux/dcache.h>
 #include <linux/generic-radix-tree.h>
 #include <linux/xattr.h>
+#include "bcachefs.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "dirent.h"

View File

@@ -38,6 +38,14 @@ static inline void set_bit(long nr, volatile unsigned long *addr)
 __atomic_or_fetch(p, mask, __ATOMIC_RELAXED);
 }
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+unsigned long mask = BIT_MASK(nr);
+unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+*p &= ~mask;
+}
 static inline void clear_bit(long nr, volatile unsigned long *addr)
 {
 unsigned long mask = BIT_MASK(nr);
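
For reference, the non-atomic __clear_bit() added above differs from clear_bit() only in using a plain read-modify-write rather than an __atomic builtin, so it is safe only when no other thread can touch the same word concurrently. A minimal usage sketch (illustrative only, not code from this tree):

	static unsigned long flags[1];

	static void bitops_example(void)
	{
		set_bit(3, flags);	/* atomic RMW via __atomic_or_fetch() */
		__clear_bit(3, flags);	/* plain load/store; caller must exclude concurrent writers */
		clear_bit(3, flags);	/* atomic variant of the same clear */
	}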

View File

@@ -90,6 +90,8 @@ do { \
 __wait_event(wq, condition); \
 } while (0)
+#define wait_event_killable(wq, condition) ({wait_event(wq, condition); 0; })
 #define __wait_event_timeout(wq, condition, timeout) \
 ___wait_event(wq, ___wait_cond_timeout(condition), \
 TASK_UNINTERRUPTIBLE, 0, timeout, \

View File

@@ -87,7 +87,7 @@ DECLARE_EVENT_CLASS(bio,
 ),
 TP_fast_assign(
-__entry->dev = bio->bi_bdev->bd_dev;
+__entry->dev = bio->bi_bdev ? bio->bi_bdev->bd_dev : 0;
 __entry->sector = bio->bi_iter.bi_sector;
 __entry->nr_sector = bio->bi_iter.bi_size >> 9;
 blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);

View File

@@ -146,17 +146,17 @@ static void pd_controllers_update(struct work_struct *work)
 u64 size = (ca->mi.nbuckets -
 ca->mi.first_bucket) << bucket_bits;
-u64 dirty = stats.buckets_dirty << bucket_bits;
+u64 dirty = stats.buckets[S_DIRTY] << bucket_bits;
 u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
 /*
 * Bytes of internal fragmentation, which can be
 * reclaimed by copy GC
 */
-s64 fragmented = ((stats.buckets_dirty +
-stats.buckets_cached) <<
-bucket_bits) -
-((stats.sectors[S_DIRTY] +
-stats.sectors[S_CACHED] ) << 9);
+s64 fragmented = ((stats.buckets[S_DIRTY] +
+stats.buckets_cached) <<
+bucket_bits) -
+((stats.sectors[S_DIRTY] +
+stats.sectors_cached) << 9);
 fragmented = max(0LL, fragmented);
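
To make the fragmentation calculation concrete (illustrative numbers only): with 512 KiB buckets (bucket_bits = 19), 100 dirty plus 50 cached buckets span 150 << 19 = 78,643,200 bytes; if those buckets only hold 120,000 sectors of live data (120,000 << 9 = 61,440,000 bytes), fragmented works out to roughly 17 MB reclaimable by copy GC, and the max(0LL, ...) clamp above guards against transiently negative values while the counters are being updated.
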
@@ -912,7 +912,7 @@ static int bch2_allocator_thread(void *arg)
 bucket = fifo_peek(&ca->free_inc);
 discard_invalidated_bucket(ca, bucket);
 if (kthread_should_stop())
-goto out;
+return 0;
 --ca->nr_invalidated;
 }
@@ -922,7 +922,7 @@ static int bch2_allocator_thread(void *arg)
 journal_seq = 0;
 ret = bch2_invalidate_free_inc(c, ca, &journal_seq);
 if (ret < 0)
-goto out;
+return 0;
 ca->nr_invalidated = ret;
@@ -944,7 +944,7 @@ static int bch2_allocator_thread(void *arg)
 down_read(&c->gc_lock);
 if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
 up_read(&c->gc_lock);
-goto out;
+return 0;
 }
 while (1) {
@@ -973,7 +973,7 @@ static int bch2_allocator_thread(void *arg)
 if (wait_buckets_available(c, ca)) {
 up_read(&c->gc_lock);
-goto out;
+return 0;
 }
 }
 up_read(&c->gc_lock);
@@ -992,13 +992,6 @@ static int bch2_allocator_thread(void *arg)
 * write out the new bucket gens:
 */
 }
-out:
-/*
- * Avoid a race with bch2_usage_update() trying to wake us up after
- * we've exited:
- */
-synchronize_rcu();
-return 0;
 }
 /* Allocation */
@@ -1892,18 +1885,20 @@ void bch2_dev_allocator_stop(struct bch_dev *ca)
 struct task_struct *p = ca->alloc_thread;
 ca->alloc_thread = NULL;
+smp_wmb();
 /*
 * We need an rcu barrier between setting ca->alloc_thread = NULL and
-* the thread shutting down to avoid a race with bch2_usage_update() -
-* the allocator thread itself does a synchronize_rcu() on exit.
+* the thread shutting down to avoid bch2_wake_allocator() racing:
 *
 * XXX: it would be better to have the rcu barrier be asynchronous
 * instead of blocking us here
 */
-if (p)
+synchronize_rcu();
+if (p) {
 kthread_stop(p);
+put_task_struct(p);
+}
 }
 /* start allocator thread: */
@@ -1917,11 +1912,13 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
 if (ca->alloc_thread)
 return 0;
-p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator");
+p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
 if (IS_ERR(p))
 return PTR_ERR(p);
+get_task_struct(p);
 ca->alloc_thread = p;
+wake_up_process(p);
 return 0;
 }
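
Taken together, the two hunks above pin the allocator task with get_task_struct() at start and rely on an RCU grace period at stop so that a concurrent waker never dereferences a stale task pointer. A condensed sketch of the pattern (simplified from the diff; bch2_wake_allocator() is the real waker in the tree, the helper names here are illustrative):

	static void wake_allocator(struct bch_dev *ca)
	{
		struct task_struct *p;

		rcu_read_lock();
		p = READ_ONCE(ca->alloc_thread);	/* may be NULL during shutdown */
		if (p)
			wake_up_process(p);		/* task_struct is pinned, so this is safe */
		rcu_read_unlock();
	}

	static void allocator_stop(struct bch_dev *ca)
	{
		struct task_struct *p = ca->alloc_thread;

		ca->alloc_thread = NULL;
		smp_wmb();		/* publish the NULL before waiting */
		synchronize_rcu();	/* wakers still inside rcu_read_lock() drain out */
		if (p) {
			kthread_stop(p);
			put_task_struct(p);	/* drop the reference taken at start */
		}
	}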

View File

@@ -282,7 +282,6 @@ do { \
 #include "alloc_types.h"
 #include "buckets_types.h"
 #include "clock_types.h"
-#include "io_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
 #include "move_types.h"
@@ -365,6 +364,7 @@ struct bch_dev {
 char name[BDEVNAME_SIZE];
 struct bcache_superblock disk_sb;
+int sb_write_error;
 struct dev_group self;
@@ -721,10 +721,6 @@ struct bch_fs {
 atomic64_t key_version;
-struct bio_list read_retry_list;
-struct work_struct read_retry_work;
-spinlock_t read_retry_lock;
 struct bio_list btree_write_error_list;
 struct work_struct btree_write_error_work;
 spinlock_t btree_write_error_lock;

View File

@@ -27,9 +27,18 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
 if (k.k->u64s < BKEY_U64s)
 return "u64s too small";
-if (k.k->size &&
-(bkey_deleted(k.k) || !ops->is_extents))
-return "nonzero size field";
+if (!ops->is_extents) {
+if (k.k->size)
+return "nonzero size field";
+} else {
+if ((k.k->size == 0) != bkey_deleted(k.k))
+return "bad size field";
+}
+if (ops->is_extents &&
+!k.k->size &&
+!bkey_deleted(k.k))
+return "zero size field";
 switch (k.k->type) {
 case KEY_TYPE_DELETED:
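
Concretely (an illustrative reading of the checks above, not text from the commit): a non-extent key such as an inode is rejected with "nonzero size field" if k->size is set at all; for extent keys the size must be zero exactly when the key is a deleted whiteout, so a mismatch fails with "bad size field", and the trailing check keeps the existing "zero size field" error for zero-sized live extents.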

View File

@@ -539,12 +539,12 @@ err:
 }
 /* Slowpath, don't want it inlined into btree_iter_traverse() */
-static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter,
+static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
+struct btree_iter *iter,
 const struct bkey_i *k,
 unsigned level,
 enum six_lock_type lock_type)
 {
-struct bch_fs *c = iter->c;
 struct btree *b;
 /*
@@ -603,7 +603,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter,
 * The btree node will have either a read or a write lock held, depending on
 * the @write parameter.
 */
-struct btree *bch2_btree_node_get(struct btree_iter *iter,
+struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
 const struct bkey_i *k, unsigned level,
 enum six_lock_type lock_type)
 {
@@ -613,7 +613,7 @@ struct btree *bch2_btree_node_get(struct btree_iter *iter,
 BUG_ON(level >= BTREE_MAX_DEPTH);
 retry:
 rcu_read_lock();
-b = mca_find(iter->c, k);
+b = mca_find(c, k);
 rcu_read_unlock();
 if (unlikely(!b)) {
@@ -622,7 +622,7 @@ retry:
 * else we could read in a btree node from disk that's been
 * freed:
 */
-b = bch2_btree_node_fill(iter, k, level, lock_type);
+b = bch2_btree_node_fill(c, iter, k, level, lock_type);
 /* We raced and found the btree node in the cache */
 if (!b)
@@ -706,10 +706,61 @@ retry:
 return b;
 }
-void bch2_btree_node_prefetch(struct btree_iter *iter,
-const struct bkey_i *k, unsigned level)
+struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
+struct btree_iter *iter,
struct btree *b,
enum btree_node_sibling sib)
{
struct btree *parent;
struct btree_node_iter node_iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
struct btree *ret;
unsigned level = b->level;
parent = iter->nodes[level + 1];
if (!parent)
return NULL;
if (!bch2_btree_node_relock(iter, level + 1)) {
bch2_btree_iter_set_locks_want(iter, level + 2);
return ERR_PTR(-EINTR);
}
node_iter = iter->node_iters[parent->level];
k = bch2_btree_node_iter_peek_all(&node_iter, parent);
BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
do {
k = sib == btree_prev_sib
? bch2_btree_node_iter_prev_all(&node_iter, parent)
: (bch2_btree_node_iter_advance(&node_iter, parent),
bch2_btree_node_iter_peek_all(&node_iter, parent));
if (!k)
return NULL;
} while (bkey_deleted(k));
bch2_bkey_unpack(parent, &tmp.k, k);
ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
btree_node_unlock(iter, level);
ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
}
if (!IS_ERR(ret) && !bch2_btree_node_relock(iter, level)) {
six_unlock_intent(&ret->lock);
ret = ERR_PTR(-EINTR);
}
return ret;
}
void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,
unsigned level, enum btree_id btree_id)
 {
-struct bch_fs *c = iter->c;
 struct btree *b;
 BUG_ON(level >= BTREE_MAX_DEPTH);
@@ -726,7 +777,7 @@ void bch2_btree_node_prefetch(struct btree_iter *iter,
 return;
 bkey_copy(&b->key, k);
-if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) {
+if (bch2_btree_node_hash_insert(c, b, level, btree_id)) {
 /* raced with another fill: */
 /* mark as unhashed... */

View File

@@ -21,11 +21,16 @@ int bch2_btree_node_cannibalize_lock(struct bch_fs *, struct closure *);
 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_get(struct btree_iter *, const struct bkey_i *,
-unsigned, enum six_lock_type);
+struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
+const struct bkey_i *, unsigned,
+enum six_lock_type);
-void bch2_btree_node_prefetch(struct btree_iter *, const struct bkey_i *,
-unsigned);
+struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
+struct btree *,
+enum btree_node_sibling);
+void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *,
+unsigned, enum btree_id);
 void bch2_fs_btree_exit(struct bch_fs *);
 int bch2_fs_btree_init(struct bch_fs *);

View File

@@ -7,7 +7,7 @@
 #include "alloc.h"
 #include "bkey_methods.h"
 #include "btree_locking.h"
-#include "btree_update.h"
+#include "btree_update_interior.h"
 #include "btree_io.h"
 #include "btree_gc.h"
 #include "buckets.h"
@@ -112,14 +112,14 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
 * For runtime mark and sweep:
 */
 static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
-struct bkey_s_c k)
+struct bkey_s_c k, unsigned flags)
 {
 switch (type) {
 case BKEY_TYPE_BTREE:
-bch2_gc_mark_key(c, k, c->sb.btree_node_size, true);
+bch2_gc_mark_key(c, k, c->sb.btree_node_size, true, flags);
 return 0;
 case BKEY_TYPE_EXTENTS:
-bch2_gc_mark_key(c, k, k.k->size, false);
+bch2_gc_mark_key(c, k, k.k->size, false, flags);
 return bch2_btree_key_recalc_oldest_gen(c, k);
 default:
 BUG();
@@ -151,13 +151,10 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 extent_for_each_ptr(e, ptr) {
 struct bch_dev *ca = c->devs[ptr->dev];
 struct bucket *g = PTR_BUCKET(ca, ptr);
-struct bucket_mark new;
 if (!g->mark.gen_valid) {
-bucket_cmpxchg(g, new, ({
-new.gen = ptr->gen;
-new.gen_valid = 1;
-}));
+g->_mark.gen = ptr->gen;
+g->_mark.gen_valid = 1;
 ca->need_alloc_write = true;
 }
@@ -166,10 +163,8 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 type == BKEY_TYPE_BTREE
 ? "btree" : "data",
 ptr->gen, g->mark.gen)) {
-bucket_cmpxchg(g, new, ({
-new.gen = ptr->gen;
-new.gen_valid = 1;
-}));
+g->_mark.gen = ptr->gen;
+g->_mark.gen_valid = 1;
 ca->need_alloc_write = true;
 set_bit(BCH_FS_FIXED_GENS, &c->flags);
 }
@@ -184,13 +179,14 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 max_t(u64, k.k->version.lo,
 atomic64_read(&c->key_version)));
-bch2_btree_mark_key(c, type, k);
+bch2_btree_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
 fsck_err:
 return ret;
 }
 static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
 {
+enum bkey_type type = btree_node_type(b);
 struct btree_node_iter iter;
 struct bkey unpacked;
 struct bkey_s_c k;
@@ -201,8 +197,7 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
 btree_node_is_extents(b),
 &unpacked) {
 bch2_bkey_debugcheck(c, b, k);
-stale = max(stale, bch2_btree_mark_key(c,
-btree_node_type(b), k));
+stale = max(stale, bch2_btree_mark_key(c, type, k, 0));
 }
 return stale;
@@ -269,7 +264,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
 mutex_lock(&c->btree_root_lock);
 b = c->btree_roots[btree_id].b;
-bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
+bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
 gc_pos_set(c, gc_pos_btree_root(b->btree_id));
 mutex_unlock(&c->btree_root_lock);
@@ -379,7 +374,7 @@ static void bch2_mark_metadata(struct bch_fs *c)
 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 {
 struct bch_fs_usage stats = { 0 };
-struct btree_interior_update *as;
+struct btree_update *as;
 struct pending_btree_node_free *d;
 mutex_lock(&c->btree_interior_update_lock);
@@ -387,9 +382,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 for_each_pending_btree_node_free(c, as, d)
 if (d->index_update_done)
-__bch2_gc_mark_key(c, bkey_i_to_s_c(&d->key),
-c->sb.btree_node_size, true,
-&stats);
+__bch2_mark_key(c, bkey_i_to_s_c(&d->key),
+c->sb.btree_node_size, true,
+&stats, 0,
+BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
 /*
 * Don't apply stats - pending deletes aren't tracked in
 * bch_alloc_stats:
@@ -430,7 +426,6 @@ void bch2_gc_start(struct bch_fs *c)
 per_cpu_ptr(c->usage_percpu, cpu);
 memset(p->s, 0, sizeof(p->s));
-p->persistent_reserved = 0;
 }
 lg_global_unlock(&c->usage_lock);
@@ -551,16 +546,14 @@ static void recalc_packed_keys(struct btree *b)
 btree_keys_account_key_add(&b->nr, 0, k);
 }
-static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
-struct btree_iter *iter)
+static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
+struct btree *old_nodes[GC_MERGE_NODES])
 {
 struct btree *parent = iter->nodes[old_nodes[0]->level + 1];
-struct bch_fs *c = iter->c;
 unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
 unsigned blocks = btree_blocks(c) * 2 / 3;
 struct btree *new_nodes[GC_MERGE_NODES];
-struct btree_interior_update *as;
-struct btree_reserve *res;
+struct btree_update *as;
 struct keylist keylist;
 struct bkey_format_state format_state;
 struct bkey_format new_format;
@@ -580,23 +573,6 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
 DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
 return;
res = bch2_btree_reserve_get(c, parent, nr_old_nodes,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
NULL);
if (IS_ERR(res)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_RESERVE_GET);
return;
}
if (bch2_keylist_realloc(&keylist, NULL, 0,
(BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
goto out;
}
 /* Find a format that all keys in @old_nodes can pack into */
 bch2_bkey_format_init(&format_state);
@@ -610,21 +586,38 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
 if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
 trace_btree_gc_coalesce_fail(c,
 BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
-goto out;
+return;
 }
if (bch2_keylist_realloc(&keylist, NULL, 0,
(BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
return;
}
as = bch2_btree_update_start(c, iter->btree_id,
btree_update_reserve_required(c, parent) + nr_old_nodes,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
NULL);
if (IS_ERR(as)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_RESERVE_GET);
bch2_keylist_free(&keylist, NULL);
return;
}
 trace_btree_gc_coalesce(c, parent, nr_old_nodes);
-as = bch2_btree_interior_update_alloc(c);
 for (i = 0; i < nr_old_nodes; i++)
-bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]);
+bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
 /* Repack everything with @new_format and sort down to one bset */
 for (i = 0; i < nr_old_nodes; i++)
 new_nodes[i] =
-__bch2_btree_node_alloc_replacement(c, old_nodes[i],
-new_format, as, res);
+__bch2_btree_node_alloc_replacement(as, old_nodes[i],
+new_format);
 /*
 * Conceptually we concatenate the nodes together and slice them
@@ -738,7 +731,7 @@ next:
 bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
 /* Insert the newly coalesced nodes */
-bch2_btree_insert_node(parent, iter, &keylist, res, as);
+bch2_btree_insert_node(as, parent, iter, &keylist);
 BUG_ON(!bch2_keylist_empty(&keylist));
@@ -751,7 +744,7 @@ next:
 /* Free the old nodes and update our sliding window */
 for (i = 0; i < nr_old_nodes; i++) {
-bch2_btree_node_free_inmem(iter, old_nodes[i]);
+bch2_btree_node_free_inmem(c, old_nodes[i], iter);
 six_unlock_intent(&old_nodes[i]->lock);
 /*
@@ -768,9 +761,9 @@ next:
 six_unlock_intent(&new_nodes[i]->lock);
 }
 }
-out:
+bch2_btree_update_done(as);
 bch2_keylist_free(&keylist, NULL);
-bch2_btree_reserve_put(c, res);
 }
 static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
@@ -814,7 +807,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
 }
 memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
-bch2_coalesce_nodes(merge, &iter);
+bch2_coalesce_nodes(c, &iter, merge);
 for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
 lock_seq[i] = merge[i]->lock.state.seq;
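
The btree_gc.c changes above are mostly mechanical fallout of the new btree_update API; the coalesce path now follows this overall shape (a condensed sketch assembled from the hunks above, with error handling and the key-repacking step omitted):

	as = bch2_btree_update_start(c, iter->btree_id,
			btree_update_reserve_required(c, parent) + nr_old_nodes,
			BTREE_INSERT_NOFAIL|BTREE_INSERT_USE_RESERVE, NULL);

	for (i = 0; i < nr_old_nodes; i++)
		bch2_btree_interior_update_will_free_node(as, old_nodes[i]);

	for (i = 0; i < nr_old_nodes; i++)
		new_nodes[i] = __bch2_btree_node_alloc_replacement(as, old_nodes[i],
								   new_format);

	/* ... repack and sort keys into new_nodes ... */

	bch2_btree_insert_node(as, parent, iter, &keylist);	/* make new nodes visible */

	for (i = 0; i < nr_old_nodes; i++)
		bch2_btree_node_free_inmem(c, old_nodes[i], iter);

	bch2_btree_update_done(as);	/* replaces the old bch2_btree_reserve_put() call */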

View File

@@ -2,10 +2,11 @@
 #include "bcachefs.h"
 #include "bkey_methods.h"
 #include "btree_cache.h"
-#include "btree_update.h"
 #include "btree_io.h"
 #include "btree_iter.h"
 #include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "checksum.h"
 #include "debug.h"
@@ -872,37 +873,37 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce)
 vstruct_end(i) - (void *) i->_data);
 }
-#define btree_node_error(c, b, ptr, msg, ...) \
+#define btree_node_error(c, b, msg, ...) \
 do { \
 if (write == READ && \
 !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
 mustfix_fsck_err(c, \
 "btree node read error at btree %u level %u/%u\n"\
-"sector %llu node offset %u bset u64s %u: " msg,\
+"pos %llu:%llu node offset %u bset u64s %u: " msg,\
 (b)->btree_id, (b)->level, \
 (c)->btree_roots[(b)->btree_id].level, \
-(u64) ptr->offset, (b)->written, \
-le16_to_cpu((i)->u64s), ##__VA_ARGS__); \
+(b)->key.k.p.inode, (b)->key.k.p.offset, \
+(b)->written, le16_to_cpu((i)->u64s), \
+##__VA_ARGS__); \
 } else { \
 bch_err(c, "%s at btree %u level %u/%u\n" \
-"sector %llu node offset %u bset u64s %u: " msg,\
+"pos %llu:%llu node offset %u bset u64s %u: " msg,\
 write == WRITE \
 ? "corrupt metadata in btree node write" \
 : "btree node error", \
 (b)->btree_id, (b)->level, \
 (c)->btree_roots[(b)->btree_id].level, \
-(u64) ptr->offset, (b)->written, \
-le16_to_cpu((i)->u64s), ##__VA_ARGS__); \
+(b)->key.k.p.inode, (b)->key.k.p.offset, \
+(b)->written, le16_to_cpu((i)->u64s), \
+##__VA_ARGS__); \
 ret = BCH_FSCK_ERRORS_NOT_FIXED; \
 goto fsck_err; \
 } \
 } while (0)
 static int validate_bset(struct bch_fs *c, struct btree *b,
-const struct bch_extent_ptr *ptr,
 struct bset *i, unsigned sectors,
-unsigned *whiteout_u64s,
-int write)
+unsigned *whiteout_u64s, int write)
 {
 struct bkey_packed *k, *prev = NULL;
 struct bpos prev_pos = POS_MIN;
@@ -910,19 +911,19 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 int ret = 0;
 if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) {
-btree_node_error(c, b, ptr, "unsupported bset version");
+btree_node_error(c, b, "unsupported bset version");
 i->u64s = 0;
 return 0;
 }
 if (b->written + sectors > c->sb.btree_node_size) {
-btree_node_error(c, b, ptr, "bset past end of btree node");
+btree_node_error(c, b, "bset past end of btree node");
 i->u64s = 0;
 return 0;
 }
 if (b->written && !i->u64s)
-btree_node_error(c, b, ptr, "empty set");
+btree_node_error(c, b, "empty set");
 if (!BSET_SEPARATE_WHITEOUTS(i)) {
 seen_non_whiteout = true;
@@ -936,7 +937,7 @@
 const char *invalid;
 if (!k->u64s) {
-btree_node_error(c, b, ptr,
+btree_node_error(c, b,
 "KEY_U64s 0: %zu bytes of metadata lost",
 vstruct_end(i) - (void *) k);
@@ -945,7 +946,7 @@
 }
 if (bkey_next(k) > vstruct_last(i)) {
-btree_node_error(c, b, ptr,
+btree_node_error(c, b,
 "key extends past end of bset");
 i->u64s = cpu_to_le16((u64 *) k - i->_data);
@@ -953,7 +954,7 @@
 }
 if (k->format > KEY_FORMAT_CURRENT) {
-btree_node_error(c, b, ptr,
+btree_node_error(c, b,
 "invalid bkey format %u", k->format);
 i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -973,7 +974,7 @@
 bch2_bkey_val_to_text(c, btree_node_type(b),
 buf, sizeof(buf), u);
-btree_node_error(c, b, ptr,
+btree_node_error(c, b,
 "invalid bkey %s: %s", buf, invalid);
 i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -994,7 +995,7 @@
 *whiteout_u64s = k->_data - i->_data;
 seen_non_whiteout = true;
 } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
-btree_node_error(c, b, ptr,
+btree_node_error(c, b,
 "keys out of order: %llu:%llu > %llu:%llu",
 prev_pos.inode,
 prev_pos.offset,
@@ -1013,32 +1014,7 @@ fsck_err:
 return ret;
 }
-static bool extent_contains_ptr(struct bkey_s_c_extent e,
+int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b)
struct bch_extent_ptr match)
{
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr)
if (!memcmp(ptr, &match, sizeof(*ptr)))
return true;
return false;
}
static void bch2_btree_node_read_complete(struct btree_read_bio *rb,
struct btree *b)
{
struct bch_dev *ca = rb->pick.ca;
bio_put(&rb->bio);
percpu_ref_put(&ca->io_ref);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
 {
 struct btree_node_entry *bne;
 struct bset *i = &b->data->keys;
@@ -1049,7 +1025,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
 const char *err;
 struct bch_csum csum;
 struct nonce nonce;
-int ret, write = READ;
+int ret, should_retry = 0, write = READ;
 iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
 __bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
@@ -1066,24 +1042,22 @@
 err = "bad magic";
 if (le64_to_cpu(b->data->magic) != bset_magic(c))
-goto err;
+goto retry_err;
 err = "bad btree header";
 if (!b->data->keys.seq)
-goto err;
+goto retry_err;
 err = "unknown checksum type";
 if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
-goto err;
+goto retry_err;
-/* XXX: retry checksum errors */
 nonce = btree_nonce(b, i, b->written << 9);
 csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
 err = "bad checksum";
 if (bch2_crc_cmp(csum, b->data->csum))
-goto err;
+goto retry_err;
 bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
 &b->data->flags,
@@ -1116,12 +1090,19 @@
 err = "incorrect max key";
 if (bkey_cmp(b->data->max_key, b->key.k.p))
 goto err;
+#if 0
+/*
+* not correct anymore, due to btree node write error
+* handling
+*
+* need to add b->data->seq to btree keys and verify
+* against that
+*/
 err = "incorrect backpointer";
 if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
 b->data->ptr))
 goto err;
+#endif
 err = bch2_bkey_format_validate(&b->data->format);
 if (err)
 goto err;
@@ -1138,22 +1119,21 @@
 err = "unknown checksum type";
 if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
-goto err;
+goto retry_err;
 nonce = btree_nonce(b, i, b->written << 9);
 csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
 err = "bad checksum";
-if (memcmp(&csum, &bne->csum, sizeof(csum)))
-goto err;
+if (bch2_crc_cmp(csum, bne->csum))
+goto retry_err;
 bset_encrypt(c, i, nonce);
 sectors = vstruct_sectors(bne, c->block_bits);
 }
-ret = validate_bset(c, b, ptr, i, sectors,
-&whiteout_u64s, READ);
+ret = validate_bset(c, b, i, sectors, &whiteout_u64s, READ);
 if (ret)
 goto fsck_err;
@@ -1208,40 +1188,79 @@
 btree_node_reset_sib_u64s(b);
 out:
 mempool_free(iter, &c->fill_iter);
-return;
+return should_retry;
 err:
-btree_node_error(c, b, ptr, "%s", err);
+btree_node_error(c, b, "%s", err);
 fsck_err:
 bch2_inconsistent_error(c);
 set_btree_node_read_error(b);
 goto out;
+retry_err:
+should_retry = -1;
+goto out;
} }
 static void btree_node_read_work(struct work_struct *work)
 {
 struct btree_read_bio *rb =
 container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
struct bch_dev *ca = rb->pick.ca;
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
const struct bch_extent_ptr *ptr;
struct bch_devs_mask avoid;
-bch2_btree_node_read_done(rb->c, rb->bio.bi_private,
-rb->pick.ca, &rb->pick.ptr);
-bch2_btree_node_read_complete(rb, rb->bio.bi_private);
+bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
+percpu_ref_put(&rb->pick.ca->io_ref);
if (!bio->bi_error &&
!bch2_btree_node_read_done(c, b))
goto out;
goto err;
out:
bch2_time_stats_update(&c->btree_read_time, rb->start_time);
bio_put(&rb->bio);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
return;
err:
memset(&avoid, 0, sizeof(avoid));
__set_bit(ca->dev_idx, avoid.d);
extent_for_each_ptr(e, ptr) {
memset(&rb->pick, 0, sizeof(rb->pick));
bch2_get_read_device(c, e.k, ptr, NULL, &avoid, &rb->pick);
if (!rb->pick.ca)
continue;
bio_reset(bio);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_bdev = rb->pick.ca->disk_sb.bdev;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
if (!bio->bi_error &&
!bch2_btree_node_read_done(c, b))
goto out;
}
set_btree_node_read_error(b);
goto out;
} }
 static void btree_node_read_endio(struct bio *bio)
 {
-struct btree *b = bio->bi_private;
 struct btree_read_bio *rb =
 container_of(bio, struct btree_read_bio, bio);
-if (bch2_dev_fatal_io_err_on(bio->bi_error,
-rb->pick.ca, "IO error reading bucket %zu",
-PTR_BUCKET_NR(rb->pick.ca, &rb->pick.ptr)) ||
-bch2_meta_read_fault("btree")) {
-set_btree_node_read_error(b);
-bch2_btree_node_read_complete(rb, rb->bio.bi_private);
-return;
-}
 INIT_WORK(&rb->work, btree_node_read_work);
 schedule_work(&rb->work);
 }
@@ -1249,7 +1268,6 @@ static void btree_node_read_endio(struct bio *bio)
 void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 bool sync)
 {
-uint64_t start_time = local_clock();
 struct extent_pick_ptr pick;
 struct btree_read_bio *rb;
 struct bio *bio;
@@ -1266,6 +1284,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
 rb = container_of(bio, struct btree_read_bio, bio);
 rb->c = c;
+rb->start_time = local_clock();
 rb->pick = pick;
 bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
 bio->bi_bdev = pick.ca->disk_sb.bdev;
@@ -1277,19 +1296,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 if (sync) {
 submit_bio_wait(bio);
-if (bch2_dev_fatal_io_err_on(bio->bi_error,
-pick.ca, "IO error reading bucket %zu",
-PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
-bch2_meta_read_fault("btree")) {
-set_btree_node_read_error(b);
-goto out;
-}
-bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr);
-bch2_time_stats_update(&c->btree_read_time, start_time);
-out:
-bch2_btree_node_read_complete(rb, b);
+bio->bi_private = b;
+btree_node_read_work(&rb->work);
 } else {
 bio->bi_end_io = btree_node_read_endio;
 bio->bi_private = b;
@@ -1327,7 +1335,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
 return -EIO;
 }
-bch2_btree_set_root_initial(c, b, NULL);
+bch2_btree_set_root_for_read(c, b);
 six_unlock_intent(&b->lock);
 return 0;
@@ -1356,7 +1364,15 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
 __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
 struct bkey_i_extent *new_key;
+six_lock_read(&b->lock);
 bkey_copy(&tmp.k, &b->key);
+six_unlock_read(&b->lock);
+if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
+/* Node has been freed: */
+goto out;
+}
 new_key = bkey_i_to_extent(&tmp.k);
 while (wbio->replicas_failed) {
@@ -1371,7 +1387,7 @@
 set_btree_node_noevict(b);
 bch2_fatal_error(c);
 }
+out:
 bio_put(&wbio->bio);
 btree_node_write_done(c, b);
 if (cl)
@@ -1385,9 +1401,9 @@ void bch2_btree_write_error_work(struct work_struct *work)
 struct bio *bio;
 while (1) {
-spin_lock_irq(&c->read_retry_lock);
-bio = bio_list_pop(&c->read_retry_list);
-spin_unlock_irq(&c->read_retry_lock);
+spin_lock_irq(&c->btree_write_error_lock);
+bio = bio_list_pop(&c->btree_write_error_list);
+spin_unlock_irq(&c->btree_write_error_lock);
 if (!bio)
 break;
@@ -1406,7 +1422,7 @@ static void btree_node_write_endio(struct bio *bio)
 struct bch_fs *c = wbio->c;
 struct bch_dev *ca = wbio->ca;
-if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, "btree write") ||
+if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
 bch2_meta_write_fault("btree"))
 set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
@@ -1428,7 +1444,7 @@
 unsigned long flags;
 spin_lock_irqsave(&c->btree_write_error_lock, flags);
-bio_list_add(&c->read_retry_list, &wbio->bio);
+bio_list_add(&c->btree_write_error_list, &wbio->bio);
 spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
 queue_work(c->wq, &c->btree_write_error_work);
 return;
@@ -1450,7 +1466,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
 extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
 break;
-ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
+ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE);
 if (ret)
 bch2_inconsistent_error(c);

View File

@@ -10,6 +10,7 @@ struct btree_iter;
 struct btree_read_bio {
 struct bch_fs *c;
+u64 start_time;
 struct extent_pick_ptr pick;
 struct work_struct work;
 struct bio bio;
@@ -71,11 +72,10 @@ void bch2_btree_build_aux_trees(struct btree *);
 void bch2_btree_init_next(struct bch_fs *, struct btree *,
 struct btree_iter *);
-void bch2_btree_node_read_done(struct bch_fs *, struct btree *,
-struct bch_dev *, const struct bch_extent_ptr *);
+int bch2_btree_node_read_done(struct bch_fs *, struct btree *);
 void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
 int bch2_btree_root_read(struct bch_fs *, enum btree_id,
 const struct bkey_i *, unsigned);
 void bch2_btree_complete_write(struct bch_fs *, struct btree *,
 struct btree_write *);

View File

@@ -247,14 +247,12 @@ fail:
 return false;
 }
-static int __bch2_btree_iter_unlock(struct btree_iter *iter)
+static void __bch2_btree_iter_unlock(struct btree_iter *iter)
 {
 while (iter->nodes_locked)
 btree_node_unlock(iter, __ffs(iter->nodes_locked));
 iter->flags &= ~BTREE_ITER_UPTODATE;
-return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
 }
 int bch2_btree_iter_unlock(struct btree_iter *iter)
@@ -263,7 +261,9 @@ int bch2_btree_iter_unlock(struct btree_iter *iter)
 for_each_linked_btree_iter(iter, linked)
 __bch2_btree_iter_unlock(linked);
-return __bch2_btree_iter_unlock(iter);
+__bch2_btree_iter_unlock(iter);
+return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
 }
 /* Btree iterator: */
@@ -617,13 +617,9 @@ bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
 void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b)
 {
 struct btree_iter *linked;
-unsigned level = b->level;
 for_each_linked_btree_iter(iter, linked)
-if (linked->nodes[level] == b) {
-btree_node_unlock(linked, level);
-linked->nodes[level] = BTREE_ITER_NOT_END;
-}
+bch2_btree_iter_node_drop(linked, b);
 }
 void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
@@ -631,9 +627,9 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
 unsigned level = b->level;
 if (iter->nodes[level] == b) {
-BUG_ON(b->lock.state.intent_lock != 1);
 btree_node_unlock(iter, level);
 iter->nodes[level] = BTREE_ITER_NOT_END;
+iter->flags &= ~BTREE_ITER_UPTODATE;
 }
 }
@@ -718,7 +714,8 @@ static void btree_iter_prefetch(struct btree_iter *iter)
 break;
 bch2_bkey_unpack(b, &tmp.k, k);
-bch2_btree_node_prefetch(iter, &tmp.k, iter->level);
+bch2_btree_node_prefetch(iter->c, &tmp.k,
+iter->level, iter->btree_id);
 }
 if (!was_locked)
@@ -735,7 +732,7 @@ static inline int btree_iter_down(struct btree_iter *iter)
 bkey_reassemble(&tmp.k, k);
-b = bch2_btree_node_get(iter, &tmp.k, level, lock_type);
+b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type);
 if (unlikely(IS_ERR(b)))
 return PTR_ERR(b);
@@ -907,6 +904,8 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
 {
 int ret;
+iter->flags &= ~BTREE_ITER_UPTODATE;
 if (unlikely(!iter->nodes[iter->level]))
 return 0;
@@ -1064,11 +1063,14 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
 struct btree *b = iter->nodes[0];
 struct bkey_packed *k =
 __bch2_btree_node_iter_peek_all(&iter->node_iters[0], b);
-return (struct bkey_s_c) {
+struct bkey_s_c ret = {
 .k = &iter->k,
 .v = bkeyp_val(&b->format, k)
 };
+if (debug_check_bkeys(iter->c))
+bch2_bkey_debugcheck(iter->c, b, ret);
+return ret;
 }
 while (1) {

View File

@@ -10,6 +10,7 @@
 */
 #include "btree_iter.h"
+#include "btree_io.h"
 #include "six.h"
 /* matches six lock types */

View File

@@ -11,7 +11,7 @@
 #include "six.h"
 struct open_bucket;
-struct btree_interior_update;
+struct btree_update;
 #define MAX_BSETS 3U
@@ -105,7 +105,7 @@ struct btree {
 * node to point to them: we update the parent in memory immediately,
 * but then we must wait until the children have been written out before
 * the update to the parent can be written - this is a list of the
-* btree_interior_updates that are blocking this node from being
+* btree_updates that are blocking this node from being
 * written:
 */
 struct list_head write_blocked;
@@ -116,7 +116,7 @@ struct btree {
 * another write - because that write also won't yet be reachable and
 * marking it as completed before it's reachable would be incorrect:
 */
-struct btree_interior_update *will_make_reachable;
+struct btree_update *will_make_reachable;
 struct open_bucket *ob;
@@ -265,7 +265,7 @@ static inline bool btree_node_is_extents(struct btree *b)
 struct btree_root {
 struct btree *b;
-struct btree_interior_update *as;
+struct btree_update *as;
 /* On disk root - see async splits: */
 __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
@@ -312,6 +312,11 @@ enum btree_gc_coalesce_fail_reason {
 BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
 };
+enum btree_node_sibling {
+btree_prev_sib,
+btree_next_sib,
+};
 typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
 struct btree *,
 struct btree_node_iter *);
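
The new btree_node_sibling enum is consumed by bch2_btree_node_get_sibling(), added earlier in this commit; a sketch of the intended calling pattern, inferred from that function's body (error handling abbreviated, not verbatim from the tree):

	struct btree *sib = bch2_btree_node_get_sibling(c, iter, b, btree_prev_sib);

	if (IS_ERR(sib))		/* -EINTR: caller must retry with more locks */
		return PTR_ERR(sib);
	if (sib) {
		/* sib is returned intent-locked; use it, then drop the lock */
		six_unlock_intent(&sib->lock);
	}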

View File

@@ -1,310 +1,24 @@
-#ifndef _BCACHE_BTREE_INSERT_H
-#define _BCACHE_BTREE_INSERT_H
+#ifndef _BCACHE_BTREE_UPDATE_H
+#define _BCACHE_BTREE_UPDATE_H
-#include "btree_cache.h"
 #include "btree_iter.h"
-#include "buckets.h"
 #include "journal.h"
-#include "vstructs.h"
 struct bch_fs;
-struct bkey_format_state;
-struct bkey_format;
 struct btree;
+struct btree_insert;
-static inline void btree_node_reset_sib_u64s(struct btree *b)
-{
+void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
+struct btree_iter *);
b->sib_u64s[0] = b->nr.live_u64s;
b->sib_u64s[1] = b->nr.live_u64s;
}
struct btree_reserve {
struct disk_reservation disk_res;
unsigned nr;
struct btree *b[BTREE_RESERVE_MAX];
};
void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
struct bkey_format *);
/* Btree node freeing/allocation: */
/*
* Tracks a btree node that has been (or is about to be) freed in memory, but
* has _not_ yet been freed on disk (because the write that makes the new
* node(s) visible and frees the old hasn't completed yet)
*/
struct pending_btree_node_free {
bool index_update_done;
__le64 seq;
enum btree_id btree_id;
unsigned level;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
*
* When we split/rewrite a node, we do all the updates in memory without
* waiting for any writes to complete - we allocate the new node(s) and update
* the parent node, possibly recursively up to the root.
*
* The end result is that we have one or more new nodes being written -
* possibly several, if there were multiple splits - and then a write (updating
* an interior node) which will make all these new nodes visible.
*
* Additionally, as we split/rewrite nodes we free the old nodes - but the old
* nodes can't be freed (their space on disk can't be reclaimed) until the
* update to the interior node that makes the new node visible completes -
* until then, the old nodes are still reachable on disk.
*
*/
struct btree_interior_update {
struct closure cl;
struct bch_fs *c;
struct list_head list;
/* What kind of update are we doing? */
enum {
BTREE_INTERIOR_NO_UPDATE,
BTREE_INTERIOR_UPDATING_NODE,
BTREE_INTERIOR_UPDATING_ROOT,
BTREE_INTERIOR_UPDATING_AS,
} mode;
unsigned flags;
struct btree_reserve *reserve;
/*
* BTREE_INTERIOR_UPDATING_NODE:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
* from writing by putting this btree_interior update on the
* @b->write_blocked list with @write_blocked_list:
*/
struct btree *b;
struct list_head write_blocked_list;
/*
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
* we're now blocking another btree_interior_update
* @parent_as - btree_interior_update that's waiting on our nodes to finish
* writing, before it can make new nodes visible on disk
* @wait - list of child btree_interior_updates that are waiting on this
* btree_interior_update to make all the new nodes visible before they can free
* their old btree nodes
*/
struct btree_interior_update *parent_as;
struct closure_waitlist wait;
/*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
* btree_interior_update operation, and release it when the new node(s)
* are all persistent and reachable:
*/
struct journal_entry_pin journal;
u64 journal_seq;
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
*/
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
unsigned nr_pending;
/* New nodes, that will be made reachable by this update: */
struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
unsigned nr_new_nodes;
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
/*
* Enough room for btree_split's keys without realloc - btree node
* pointers never have crc/compression info, so we only need to acount
* for the pointers for three keys
*/
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0)
#define for_each_pending_btree_node_free(c, as, p) \
list_for_each_entry(as, &c->btree_interior_update_list, list) \
for (p = as->pending; p < as->pending + as->nr_pending; p++)
void bch2_btree_node_free_inmem(struct btree_iter *, struct btree *);
void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *);
struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *,
struct btree *,
struct bkey_format,
struct btree_interior_update *,
struct btree_reserve *);
struct btree_interior_update *
bch2_btree_interior_update_alloc(struct bch_fs *);
void bch2_btree_interior_update_will_free_node(struct bch_fs *,
struct btree_interior_update *,
struct btree *);
void bch2_btree_set_root_initial(struct bch_fs *, struct btree *,
struct btree_reserve *);
void bch2_btree_reserve_put(struct bch_fs *, struct btree_reserve *);
struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *,
struct btree *, unsigned,
unsigned, struct closure *);
int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *);
/* Inserting into a given leaf node (last stage of insert): */
 bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
 struct btree_node_iter *, struct bkey_i *);
 void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
 struct bkey_i *);
static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
{
return (void *) b->data + btree_bytes(c);
}
static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
struct btree *b)
{
return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
}
static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
struct btree *b)
{
return btree_data_end(c, b);
}
static inline void *write_block(struct btree *b)
{
return (void *) b->data + (b->written << 9);
}
static inline bool bset_written(struct btree *b, struct bset *i)
{
return (void *) i < write_block(b);
}
static inline bool bset_unwritten(struct btree *b, struct bset *i)
{
return (void *) i > write_block(b);
}
static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
struct bset *i)
{
return round_up(bset_byte_offset(b, vstruct_end(i)),
block_bytes(c)) >> 9;
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
unsigned total = c->sb.btree_node_size << 6;
EBUG_ON(used > total);
if (bset_written(b, i))
return 0;
return total - used;
}
static inline unsigned btree_write_set_buffer(struct btree *b)
{
/*
* Could buffer up larger amounts of keys for btrees with larger keys,
* pending benchmarking:
*/
return 4 << 10;
}
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned offset = max_t(unsigned, b->written << 9,
bset_byte_offset(b, vstruct_end(i)));
ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
(offset + sizeof(struct btree_node_entry) +
b->whiteout_u64s * sizeof(u64) +
b->uncompacted_whiteout_u64s * sizeof(u64));
EBUG_ON(offset > btree_bytes(c));
if ((unlikely(bset_written(b, i)) && n > 0) ||
(unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
n > btree_write_set_buffer(b)))
return (void *) b->data + offset;
return NULL;
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
struct btree *b, unsigned u64s)
{
if (btree_node_is_extents(b)) {
/* The insert key might split an existing key
* (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case:
*/
u64s += BKEY_EXTENT_U64s_MAX;
}
return u64s <= bch_btree_keys_u64s_remaining(c, b);
}
static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
EBUG_ON(b->uncompacted_whiteout_u64s <
bkeyp_key_u64s(&b->format, k));
b->uncompacted_whiteout_u64s -=
bkeyp_key_u64s(&b->format, k);
}
}
static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
BUG_ON(!k->needs_whiteout);
b->uncompacted_whiteout_u64s +=
bkeyp_key_u64s(&b->format, k);
}
}
void bch2_btree_insert_node(struct btree *, struct btree_iter *,
struct keylist *, struct btree_reserve *,
struct btree_interior_update *as);
/* Normal update interface: */ /* Normal update interface: */
struct btree_insert { struct btree_insert {
struct bch_fs *c; struct bch_fs *c;
struct disk_reservation *disk_res; struct disk_reservation *disk_res;
struct journal_res journal_res; struct journal_res journal_res;
u64 *journal_seq; u64 *journal_seq;
@ -403,25 +117,6 @@ int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
struct disk_reservation *, struct disk_reservation *,
struct extent_insert_hook *, u64 *, unsigned); struct extent_insert_hook *, u64 *, unsigned);
static inline bool journal_res_insert_fits(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
unsigned u64s = 0;
struct btree_insert_entry *i;
/*
* If we didn't get a journal reservation, we're in journal replay and
* we're not journalling updates:
*/
if (!trans->journal_res.ref)
return true;
for (i = insert; i < trans->entries + trans->nr; i++)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
return u64s <= trans->journal_res.u64s;
}
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, struct disk_reservation *,
struct extent_insert_hook *, u64 *, int flags); struct extent_insert_hook *, u64 *, int flags);
@ -438,5 +133,5 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
int bch2_btree_node_update_key(struct bch_fs *, struct btree *, int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
struct bkey_i_extent *); struct bkey_i_extent *);
#endif /* _BCACHE_BTREE_INSERT_H */ #endif /* _BCACHE_BTREE_UPDATE_H */

View File

@ -0,0 +1,312 @@
#ifndef _BCACHE_BTREE_UPDATE_INTERIOR_H
#define _BCACHE_BTREE_UPDATE_INTERIOR_H
#include "btree_cache.h"
#include "btree_update.h"
struct btree_reserve {
struct disk_reservation disk_res;
unsigned nr;
struct btree *b[BTREE_RESERVE_MAX];
};
void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
struct bkey_format *);
/* Btree node freeing/allocation: */
/*
* Tracks a btree node that has been (or is about to be) freed in memory, but
* has _not_ yet been freed on disk (because the write that makes the new
* node(s) visible and frees the old hasn't completed yet)
*/
struct pending_btree_node_free {
bool index_update_done;
__le64 seq;
enum btree_id btree_id;
unsigned level;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
*
* When we split/rewrite a node, we do all the updates in memory without
* waiting for any writes to complete - we allocate the new node(s) and update
* the parent node, possibly recursively up to the root.
*
* The end result is that we have one or more new nodes being written -
* possibly several, if there were multiple splits - and then a write (updating
* an interior node) which will make all these new nodes visible.
*
* Additionally, as we split/rewrite nodes we free the old nodes - but the old
* nodes can't be freed (their space on disk can't be reclaimed) until the
* update to the interior node that makes the new node visible completes -
* until then, the old nodes are still reachable on disk.
*
*/
struct btree_update {
struct closure cl;
struct bch_fs *c;
struct list_head list;
/* What kind of update are we doing? */
enum {
BTREE_INTERIOR_NO_UPDATE,
BTREE_INTERIOR_UPDATING_NODE,
BTREE_INTERIOR_UPDATING_ROOT,
BTREE_INTERIOR_UPDATING_AS,
} mode;
enum btree_id btree_id;
unsigned flags;
struct btree_reserve *reserve;
/*
* BTREE_INTERIOR_UPDATING_NODE:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
* from writing by putting this btree_update on the
* @b->write_blocked list with @write_blocked_list:
*/
struct btree *b;
struct list_head write_blocked_list;
/*
* BTREE_INTERIOR_UPDATING_AS: the btree node we updated was freed, so
* we're now blocking another btree_update:
* @parent_as - btree_update that's waiting on our nodes to finish
* writing, before it can make new nodes visible on disk
* @wait - list of child btree_updates that are waiting on this
* btree_update to make all the new nodes visible before they can free
* their old btree nodes
*/
struct btree_update *parent_as;
struct closure_waitlist wait;
/*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
* btree_update operation, and release it when the new node(s)
* are all persistent and reachable:
*/
struct journal_entry_pin journal;
u64 journal_seq;
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
*/
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
unsigned nr_pending;
/* New nodes, that will be made reachable by this update: */
struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
unsigned nr_new_nodes;
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
/*
* Enough room for btree_split's keys without realloc - btree node
* pointers never have crc/compression info, so we only need to account
* for the pointers for three keys
*/
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0)
#define for_each_pending_btree_node_free(c, as, p) \
list_for_each_entry(as, &c->btree_interior_update_list, list) \
for (p = as->pending; p < as->pending + as->nr_pending; p++)
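This macro expands to a nested loop over every in-flight btree_update and each of its pending node frees. A minimal, hypothetical walker (not part of this patch; it assumes the caller holds whatever locking protects c->btree_interior_update_list and the pending arrays):
static unsigned count_pending_btree_node_frees(struct bch_fs *c)
{
	struct btree_update *as;
	struct pending_btree_node_free *p;
	unsigned nr = 0;

	/* visit every not-yet-reclaimed node across all in-flight updates: */
	for_each_pending_btree_node_free(c, as, p)
		nr++;

	return nr;
}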
void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
struct btree_iter *);
void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *);
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
void bch2_btree_update_done(struct btree_update *);
struct btree_update *
bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
unsigned, struct closure *);
void bch2_btree_interior_update_will_free_node(struct btree_update *,
struct btree *);
void bch2_btree_insert_node(struct btree_update *, struct btree *,
struct btree_iter *, struct keylist *);
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
int bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
enum btree_node_sibling);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
{
unsigned depth = btree_node_root(c, b)->level - b->level;
return btree_reserve_required_nodes(depth);
}
static inline void btree_node_reset_sib_u64s(struct btree *b)
{
b->sib_u64s[0] = b->nr.live_u64s;
b->sib_u64s[1] = b->nr.live_u64s;
}
static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
{
return (void *) b->data + btree_bytes(c);
}
static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
struct btree *b)
{
return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
}
static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
struct btree *b)
{
return btree_data_end(c, b);
}
static inline void *write_block(struct btree *b)
{
return (void *) b->data + (b->written << 9);
}
static inline bool bset_written(struct btree *b, struct bset *i)
{
return (void *) i < write_block(b);
}
static inline bool bset_unwritten(struct btree *b, struct bset *i)
{
return (void *) i > write_block(b);
}
static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
struct bset *i)
{
return round_up(bset_byte_offset(b, vstruct_end(i)),
block_bytes(c)) >> 9;
}
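For example, with 4 KiB blocks (block_bytes(c) == 4096), a bset whose last byte sits 5000 bytes into the node rounds up to 8192 bytes, so bset_end_sector() returns 16 (sectors are 512 bytes throughout).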
static inline unsigned btree_write_set_buffer(struct btree *b)
{
/*
* Could buffer up larger amounts of keys for btrees with larger keys,
* pending benchmarking:
*/
return 4 << 10;
}
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned offset = max_t(unsigned, b->written << 9,
bset_byte_offset(b, vstruct_end(i)));
ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
(offset + sizeof(struct btree_node_entry) +
b->whiteout_u64s * sizeof(u64) +
b->uncompacted_whiteout_u64s * sizeof(u64));
EBUG_ON(offset > btree_bytes(c));
if ((unlikely(bset_written(b, i)) && n > 0) ||
(unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
n > btree_write_set_buffer(b)))
return (void *) b->data + offset;
return NULL;
}
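In other words, want_new_bset() returns where the next bset header would go when it's worth opening one: either the current bset has already been written out (and a btree_node_entry plus the buffered whiteouts still fit), or the open bset has grown past the 4 KiB write buffer and at least that much space is still free. Otherwise it returns NULL and the insert goes into the existing bset.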
static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
EBUG_ON(b->uncompacted_whiteout_u64s <
bkeyp_key_u64s(&b->format, k));
b->uncompacted_whiteout_u64s -=
bkeyp_key_u64s(&b->format, k);
}
}
static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
BUG_ON(!k->needs_whiteout);
b->uncompacted_whiteout_u64s +=
bkeyp_key_u64s(&b->format, k);
}
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
unsigned total = c->sb.btree_node_size << 6;
EBUG_ON(used > total);
if (bset_written(b, i))
return 0;
return total - used;
}
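For example, with 256 KiB btree nodes (c->sb.btree_node_size == 512 sectors), total is 512 << 6 == 32768 u64s; if the end of the last bset is 10000 u64s into the node and 200 u64s of whiteouts are buffered, 22568 u64s remain for new keys.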
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
struct btree *b, unsigned u64s)
{
if (btree_node_is_extents(b)) {
/*
* The insert key might split an existing key (e.g. the
* bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
*/
u64s += BKEY_EXTENT_U64s_MAX;
}
return u64s <= bch_btree_keys_u64s_remaining(c, b);
}
static inline bool journal_res_insert_fits(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
unsigned u64s = 0;
struct btree_insert_entry *i;
/*
* If we didn't get a journal reservation, we're in journal replay and
* we're not journalling updates:
*/
if (!trans->journal_res.ref)
return true;
for (i = insert; i < trans->entries + trans->nr; i++)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
return u64s <= trans->journal_res.u64s;
}
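As an example, a transaction with two remaining entries of 6 and 10 u64s (and no extra_res) needs jset_u64s(6) + jset_u64s(10) u64s of journal space, where jset_u64s() presumably accounts for the per-entry jset header on top of the key itself; this mirrors the reservation taken in __bch2_btree_insert_at() below.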
#endif /* _BCACHE_BTREE_UPDATE_INTERIOR_H */

View File

@ -0,0 +1,660 @@
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
bool bch2_btree_bset_insert_key(struct btree_iter *iter,
struct btree *b,
struct btree_node_iter *node_iter,
struct bkey_i *insert)
{
const struct bkey_format *f = &b->format;
struct bkey_packed *k;
struct bset_tree *t;
unsigned clobber_u64s;
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && !bkey_cmp_packed(b, k, &insert->k)) {
BUG_ON(bkey_whiteout(k));
t = bch2_bkey_to_bset(b, k);
if (bset_unwritten(b, bset(b, t)) &&
bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k));
k->type = insert->k.type;
memcpy_u64s(bkeyp_val(f, k), &insert->v,
bkey_val_u64s(&insert->k));
return true;
}
insert->k.needs_whiteout = k->needs_whiteout;
btree_keys_account_key_drop(&b->nr, t - b->set, k);
if (t == bset_tree_last(b)) {
clobber_u64s = k->u64s;
/*
* If we're deleting, and the key we're deleting doesn't
* need a whiteout (it wasn't overwriting a key that had
* been written to disk) - just delete it:
*/
if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
bch2_bset_delete(b, k, clobber_u64s);
bch2_btree_node_iter_fix(iter, b, node_iter, t,
k, clobber_u64s, 0);
return true;
}
goto overwrite;
}
k->type = KEY_TYPE_DELETED;
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
k->u64s, k->u64s);
if (bkey_whiteout(&insert->k)) {
reserve_whiteout(b, t, k);
return true;
} else {
k->needs_whiteout = false;
}
} else {
/*
* Deleting, but the key to delete wasn't found - nothing to do:
*/
if (bkey_whiteout(&insert->k))
return false;
insert->k.needs_whiteout = false;
}
t = bset_tree_last(b);
k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
clobber_u64s = 0;
overwrite:
bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
clobber_u64s, k->u64s);
return true;
}
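To summarize the paths above: a key overwritten in the still-unwritten last bset with an identically sized value is updated in place; a key found in the last bset is otherwise dropped or clobbered there directly; a key in an older bset is marked KEY_TYPE_DELETED and, unless the insert was itself a deletion (which just reserves a whiteout), the new key is appended to the last bset; and deleting a key that was never present is a no-op.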
static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
unsigned i, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
six_lock_read(&b->lock);
bch2_btree_node_write_dirty(c, b, NULL,
(btree_current_write(b) == w &&
w->journal.pin_list == journal_seq_pin(j, seq)));
six_unlock_read(&b->lock);
}
static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 0, seq);
}
static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 1, seq);
}
void bch2_btree_journal_key(struct btree_insert *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree *b = iter->nodes[0];
struct btree_write *w = btree_current_write(b);
EBUG_ON(iter->level || b->level);
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
if (!journal_pin_active(&w->journal))
bch2_journal_pin_add(j, &trans->journal_res,
&w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
if (trans->journal_res.ref) {
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
/* ick */
insert->k.needs_whiteout = false;
bch2_journal_add_keys(j, &trans->journal_res,
b->btree_id, insert);
insert->k.needs_whiteout = needs_whiteout;
bch2_journal_set_has_inode(j, &trans->journal_res,
insert->k.p.inode);
if (trans->journal_seq)
*trans->journal_seq = seq;
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
}
if (!btree_node_dirty(b))
set_btree_node_dirty(b);
}
static enum btree_insert_ret
bch2_insert_fixup_key(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
struct btree_iter *iter = insert->iter;
BUG_ON(iter->level);
BUG_ON(insert->k->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, iter->nodes[0]));
if (bch2_btree_bset_insert_key(iter, iter->nodes[0],
&iter->node_iters[0],
insert->k))
bch2_btree_journal_key(trans, iter, insert->k);
trans->did_work = true;
return BTREE_INSERT_OK;
}
static inline int foreground_maybe_merge(struct bch_fs *c,
struct btree_iter *iter,
enum btree_node_sibling sib)
{
struct btree *b;
if (!btree_node_locked(iter, iter->level))
return 0;
b = iter->nodes[iter->level];
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
return 0;
return bch2_foreground_maybe_merge(c, iter, sib);
}
/**
* btree_insert_key - insert a single key into a leaf node
*/
static enum btree_insert_ret
btree_insert_key(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
struct btree *b = iter->nodes[0];
enum btree_insert_ret ret;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
iter->flags &= ~BTREE_ITER_UPTODATE;
ret = !btree_node_is_extents(b)
? bch2_insert_fixup_key(trans, insert)
: bch2_insert_fixup_extent(trans, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
bch2_btree_iter_reinit_node(iter, b);
trace_btree_insert_key(c, b, insert->k);
return ret;
}
static bool same_leaf_as_prev(struct btree_insert *trans,
struct btree_insert_entry *i)
{
/*
* Because we sorted the transaction entries, if multiple iterators
* point to the same leaf node they'll always be adjacent now:
*/
return i != trans->entries &&
i[0].iter->nodes[0] == i[-1].iter->nodes[0];
}
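For example, if a transaction carries three entries and the first two iterators point at the same leaf, sorting makes those two adjacent, so multi_lock_write()/multi_unlock_write() below take and release that leaf's write lock only once.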
#define trans_for_each_entry(trans, i) \
for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
{
bch2_btree_node_lock_write(b, iter);
if (btree_node_just_written(b) &&
bch2_btree_post_write_cleanup(c, b))
bch2_btree_iter_reinit_node(iter, b);
/*
* If the last bset has been written, or if it's gotten too big - start
* a new bset to insert into:
*/
if (want_new_bset(c, b))
bch2_btree_init_next(c, b, iter);
}
static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(c, i->iter->nodes[0], i->iter);
}
static void multi_unlock_write(struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write(i->iter->nodes[0], i->iter);
}
static int btree_trans_entry_cmp(const void *_l, const void *_r)
{
const struct btree_insert_entry *l = _l;
const struct btree_insert_entry *r = _r;
return btree_iter_cmp(l->iter, r->iter);
}
/* Normal update interface: */
/**
* __bch2_btree_insert_at - insert keys at given iterator positions
*
* This is the main entry point for btree updates.
*
* Return values:
* -EINTR: locking changed, this function should be called again. Only returned
* if passed BTREE_INSERT_ATOMIC.
* -EROFS: filesystem read only
* -EIO: journal or btree node IO error
*/
int __bch2_btree_insert_at(struct btree_insert *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_iter *split = NULL;
bool cycle_gc_lock = false;
unsigned u64s;
int ret;
trans_for_each_entry(trans, i) {
BUG_ON(i->iter->level);
BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
BUG_ON(debug_check_bkeys(c) &&
bch2_bkey_invalid(c, i->iter->btree_id,
bkey_i_to_s_c(i->k)));
}
sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
btree_trans_entry_cmp, NULL);
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
retry_locks:
ret = -EINTR;
trans_for_each_entry(trans, i)
if (!bch2_btree_iter_set_locks_want(i->iter, 1))
goto err;
retry:
trans->did_work = false;
u64s = 0;
trans_for_each_entry(trans, i)
if (!i->done)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
? bch2_journal_res_get(&c->journal,
&trans->journal_res,
u64s, u64s)
: 0;
if (ret)
goto err;
multi_lock_write(c, trans);
u64s = 0;
trans_for_each_entry(trans, i) {
/* Multiple inserts might go to the same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
/*
* bch2_btree_node_insert_fits() must be called under write lock:
* with only an intent lock, another thread can still call
* bch2_btree_node_write(), converting an unwritten bset to a
* written one
*/
if (!i->done) {
u64s += i->k->k.u64s + i->extra_res;
if (!bch2_btree_node_insert_fits(c,
i->iter->nodes[0], u64s)) {
split = i->iter;
goto unlock;
}
}
}
ret = 0;
split = NULL;
cycle_gc_lock = false;
trans_for_each_entry(trans, i) {
if (i->done)
continue;
switch (btree_insert_key(trans, i)) {
case BTREE_INSERT_OK:
i->done = true;
break;
case BTREE_INSERT_JOURNAL_RES_FULL:
case BTREE_INSERT_NEED_TRAVERSE:
ret = -EINTR;
break;
case BTREE_INSERT_NEED_RESCHED:
ret = -EAGAIN;
break;
case BTREE_INSERT_BTREE_NODE_FULL:
split = i->iter;
break;
case BTREE_INSERT_ENOSPC:
ret = -ENOSPC;
break;
case BTREE_INSERT_NEED_GC_LOCK:
cycle_gc_lock = true;
ret = -EINTR;
break;
default:
BUG();
}
if (!trans->did_work && (ret || split))
break;
}
unlock:
multi_unlock_write(trans);
bch2_journal_res_put(&c->journal, &trans->journal_res);
if (split)
goto split;
if (ret)
goto err;
/*
* hack: iterators are inconsistent when they hit end of leaf, until
* traversed again
*/
trans_for_each_entry(trans, i)
if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF)
goto out;
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i)) {
foreground_maybe_merge(c, i->iter, btree_prev_sib);
foreground_maybe_merge(c, i->iter, btree_next_sib);
}
out:
/* make sure we didn't lose an error: */
if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
trans_for_each_entry(trans, i)
BUG_ON(!i->done);
percpu_ref_put(&c->writes);
return ret;
split:
/*
* have to drop journal res before splitting, because splitting means
* allocating new btree nodes, and holding a journal reservation
* potentially blocks the allocator:
*/
ret = bch2_btree_split_leaf(c, split, trans->flags);
if (ret)
goto err;
/*
* if the split didn't have to drop locks the insert will still be
* atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
* and is overwriting won't have changed)
*/
goto retry_locks;
err:
if (cycle_gc_lock) {
down_read(&c->gc_lock);
up_read(&c->gc_lock);
}
if (ret == -EINTR) {
trans_for_each_entry(trans, i) {
int ret2 = bch2_btree_iter_traverse(i->iter);
if (ret2) {
ret = ret2;
goto out;
}
}
/*
* BTREE_INSERT_ATOMIC means we have to return -EINTR if we
* dropped locks:
*/
if (!(trans->flags & BTREE_INSERT_ATOMIC))
goto retry;
}
goto out;
}
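A minimal, hypothetical caller (not part of this patch) sketching the -EINTR convention documented above: with BTREE_INSERT_ATOMIC, the caller retries the whole update after locks were dropped, re-deriving whatever state it peeked at first.
static int example_atomic_update(struct bch_fs *c, struct btree_iter *iter,
				 struct bkey_i *k, u64 *journal_seq)
{
	int ret;

	do {
		/* re-check/recompute anything this update depends on here */
		ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
					   BTREE_INSERT_ATOMIC,
					   BTREE_INSERT_ENTRY(iter, k));
	} while (ret == -EINTR);

	return ret;
}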
int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
{
struct bkey_i k;
bkey_init(&k.k);
k.k.p = iter->pos;
return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|flags,
BTREE_INSERT_ENTRY(iter, &k));
}
int bch2_btree_insert_list_at(struct btree_iter *iter,
struct keylist *keys,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq, unsigned flags)
{
BUG_ON(flags & BTREE_INSERT_ATOMIC);
BUG_ON(bch2_keylist_empty(keys));
bch2_verify_keylist_sorted(keys);
while (!bch2_keylist_empty(keys)) {
/* need to traverse between each insert */
int ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
ret = bch2_btree_insert_at(iter->c, disk_res, hook,
journal_seq, flags,
BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
if (ret)
return ret;
bch2_keylist_pop_front(keys);
}
return 0;
}
/**
* bch2_btree_insert - insert a single key into the btree specified by @id
* @c: pointer to struct bch_fs
* @id: btree to insert into
* @k: key to insert
* @hook: insert callback
*/
int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
struct bkey_i *k,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq, int flags)
{
struct btree_iter iter;
int ret, ret2;
bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&iter);
if (unlikely(ret))
goto out;
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
BTREE_INSERT_ENTRY(&iter, k));
out: ret2 = bch2_btree_iter_unlock(&iter);
return ret ?: ret2;
}
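A hedged usage sketch, only to show the argument order (no disk reservation, no insert hook, default flags); the helper name is hypothetical:
static int example_insert_one(struct bch_fs *c, enum btree_id id,
			      struct bkey_i *k, u64 *journal_seq)
{
	/* disk_res and hook are optional and may be NULL: */
	return bch2_btree_insert(c, id, k, NULL, NULL, journal_seq, 0);
}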
/**
* bch2_btree_update - like bch2_btree_insert(), but asserts that we're
* overwriting an existing key
*/
int bch2_btree_update(struct bch_fs *c, enum btree_id id,
struct bkey_i *k, u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c u;
int ret;
EBUG_ON(id == BTREE_ID_EXTENTS);
bch2_btree_iter_init(&iter, c, id, k->k.p,
BTREE_ITER_INTENT);
u = bch2_btree_iter_peek_with_holes(&iter);
ret = btree_iter_err(u);
if (ret)
return ret;
if (bkey_deleted(u.k)) {
bch2_btree_iter_unlock(&iter);
return -ENOENT;
}
ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, 0,
BTREE_INSERT_ENTRY(&iter, k));
bch2_btree_iter_unlock(&iter);
return ret;
}
/*
* bch2_btree_delete_range - delete everything within a given range
*
* Range is a half open interval - [start, end)
*/
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start,
struct bpos end,
struct bversion version,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_btree_iter_init(&iter, c, id, start,
BTREE_ITER_INTENT);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = btree_iter_err(k))) {
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
/* really shouldn't be using a bare, unpadded bkey_i */
struct bkey_i delete;
if (bkey_cmp(iter.pos, end) >= 0)
break;
bkey_init(&delete.k);
/*
* For extents, iter.pos won't necessarily be the same as
* bkey_start_pos(k.k) (for non extents they always will be the
* same). It's important that we delete starting from iter.pos
* because the range we want to delete could start in the middle
* of k.
*
* (bch2_btree_iter_peek() does guarantee that iter.pos >=
* bkey_start_pos(k.k)).
*/
delete.k.p = iter.pos;
delete.k.version = version;
if (iter.flags & BTREE_ITER_IS_EXTENTS) {
/*
* The extents btree is special - KEY_TYPE_DISCARD is
* used for deletions, not KEY_TYPE_DELETED. This is an
* internal implementation detail that probably
* shouldn't be exposed (internally, KEY_TYPE_DELETED is
* used as a proxy for k->size == 0):
*/
delete.k.type = KEY_TYPE_DISCARD;
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete.k);
}
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &delete));
if (ret)
break;
bch2_btree_iter_cond_resched(&iter);
}
bch2_btree_iter_unlock(&iter);
return ret;
}
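A hypothetical caller sketch relying on the half-open [start, end) convention above — deleting every extent belonging to one inode (the fpunch path further down does the same thing, with a disk reservation and an i_sectors hook):
static int example_delete_inode_extents(struct bch_fs *c, u64 inum,
					u64 *journal_seq)
{
	/* [POS(inum, 0), POS(inum + 1, 0)) covers all of inode inum: */
	return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
				       POS(inum, 0), POS(inum + 1, 0),
				       ZERO_VERSION, NULL, NULL,
				       journal_seq);
}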

View File

@ -80,21 +80,25 @@ static void bch2_fs_stats_verify(struct bch_fs *c)
{ {
struct bch_fs_usage stats = struct bch_fs_usage stats =
__bch2_fs_usage_read(c); __bch2_fs_usage_read(c);
unsigned i;
if ((s64) stats.sectors_dirty < 0) for (i = 0; i < BCH_REPLICAS_MAX; i++) {
panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty); if ((s64) stats.s[i].data[S_META] < 0)
panic("replicas %u meta underflow: %lli\n",
i + 1, stats.s[i].data[S_META]);
if ((s64) stats.sectors_cached < 0) if ((s64) stats.s[i].data[S_DIRTY] < 0)
panic("sectors_cached underflow: %lli\n", stats.sectors_cached); panic("replicas %u dirty underflow: %lli\n",
i + 1, stats.s[i].data[S_DIRTY]);
if ((s64) stats.sectors_meta < 0) if ((s64) stats.s[i].persistent_reserved < 0)
panic("sectors_meta underflow: %lli\n", stats.sectors_meta); panic("replicas %u reserved underflow: %lli\n",
i + 1, stats.s[i].persistent_reserved);
}
if ((s64) stats.sectors_persistent_reserved < 0) if ((s64) stats.online_reserved < 0)
panic("sectors_persistent_reserved underflow: %lli\n", stats.sectors_persistent_reserved); panic("sectors_online_reserved underflow: %lli\n",
stats.online_reserved);
if ((s64) stats.sectors_online_reserved < 0)
panic("sectors_online_reserved underflow: %lli\n", stats.sectors_online_reserved);
} }
#else #else
@ -223,11 +227,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
struct disk_reservation *disk_res, struct disk_reservation *disk_res,
struct gc_pos gc_pos) struct gc_pos gc_pos)
{ {
s64 added = struct fs_usage_sum sum = __fs_usage_sum(*stats);
stats->s[S_COMPRESSED][S_META] + s64 added = sum.data + sum.reserved;
stats->s[S_COMPRESSED][S_DIRTY] +
stats->persistent_reserved +
stats->online_reserved;
/* /*
* Not allowed to reduce sectors_available except by getting a * Not allowed to reduce sectors_available except by getting a
@ -255,19 +256,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
memset(stats, 0, sizeof(*stats)); memset(stats, 0, sizeof(*stats));
} }
static void bch2_fs_usage_update(struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new)
{
fs_usage->s[S_COMPRESSED][S_CACHED] +=
(int) new.cached_sectors - (int) old.cached_sectors;
fs_usage->s[S_COMPRESSED][bucket_type(old)] -=
old.dirty_sectors;
fs_usage->s[S_COMPRESSED][bucket_type(new)] +=
new.dirty_sectors;
}
static void bch2_dev_usage_update(struct bch_dev *ca, static void bch2_dev_usage_update(struct bch_dev *ca,
struct bucket_mark old, struct bucket_mark new) struct bucket_mark old, struct bucket_mark new)
{ {
struct bch_fs *c = ca->fs; struct bch_fs *c = ca->fs;
struct bch_dev_usage *dev_usage; struct bch_dev_usage *dev_usage;
@ -280,7 +270,7 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
preempt_disable(); preempt_disable();
dev_usage = this_cpu_ptr(ca->usage_percpu); dev_usage = this_cpu_ptr(ca->usage_percpu);
dev_usage->sectors[S_CACHED] += dev_usage->sectors_cached +=
(int) new.cached_sectors - (int) old.cached_sectors; (int) new.cached_sectors - (int) old.cached_sectors;
dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors; dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
@ -289,9 +279,9 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
dev_usage->buckets_alloc += dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator; (int) new.owned_by_allocator - (int) old.owned_by_allocator;
dev_usage->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old); dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old);
dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old);
dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old); dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
dev_usage->buckets_dirty += is_dirty_bucket(new) - is_dirty_bucket(old);
preempt_enable(); preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new)) if (!is_available_bucket(old) && is_available_bucket(new))
@ -309,7 +299,6 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
struct bucket_mark *old) struct bucket_mark *old)
{ {
struct bch_fs_usage stats = { 0 };
struct bucket_mark new; struct bucket_mark new;
*old = bucket_data_cmpxchg(ca, g, new, ({ *old = bucket_data_cmpxchg(ca, g, new, ({
@ -324,12 +313,8 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
new.gen++; new.gen++;
})); }));
/* XXX: we're not actually updating fs usage's cached sectors... */
bch2_fs_usage_update(&stats, *old, new);
if (!old->owned_by_allocator && old->cached_sectors) if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, g - ca->buckets, trace_invalidate(ca, g - ca->buckets, old->cached_sectors);
old->cached_sectors);
return true; return true;
} }
@ -367,12 +352,15 @@ void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g, void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
bool owned_by_allocator) bool owned_by_allocator)
{ {
struct bucket_mark new; struct bucket_mark old, new;
bucket_data_cmpxchg(ca, g, new, ({ old = bucket_data_cmpxchg(ca, g, new, ({
new.touched_this_mount = 1; new.touched_this_mount = 1;
new.owned_by_allocator = owned_by_allocator; new.owned_by_allocator = owned_by_allocator;
})); }));
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
ca->fs->gc_pos.phase == GC_PHASE_DONE);
} }
#define saturated_add(ca, dst, src, max) \ #define saturated_add(ca, dst, src, max) \
@ -414,34 +402,14 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
bucket_became_unavailable(ca->fs, old, new)); bucket_became_unavailable(ca->fs, old, new));
} }
#if 0
/* Reverting this until the copygc + compression issue is fixed: */ /* Reverting this until the copygc + compression issue is fixed: */
static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
{ {
return crc_compression_type(crc) return sectors * crc_compressed_size(NULL, crc) /
? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc) crc_uncompressed_size(NULL, crc);
: sectors;
} }
static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
return crc_compression_type(crc)
? min_t(unsigned, crc_compressed_size(crc), sectors)
: sectors;
}
#else
static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
return sectors;
}
static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
return sectors;
}
#endif
/* /*
* Checking against gc's position has to be done here, inside the cmpxchg() * Checking against gc's position has to be done here, inside the cmpxchg()
* loop, to avoid racing with the start of gc clearing all the marks - GC does * loop, to avoid racing with the start of gc clearing all the marks - GC does
@ -452,9 +420,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
const union bch_extent_crc *crc, const union bch_extent_crc *crc,
const struct bch_extent_ptr *ptr, const struct bch_extent_ptr *ptr,
s64 sectors, enum s_alloc type, s64 sectors, enum s_alloc type,
bool may_make_unavailable,
struct bch_fs_usage *stats, struct bch_fs_usage *stats,
bool gc_will_visit, u64 journal_seq) u64 journal_seq, unsigned flags)
{ {
struct bucket_mark old, new; struct bucket_mark old, new;
unsigned saturated; unsigned saturated;
@ -462,23 +429,24 @@ static void bch2_mark_pointer(struct bch_fs *c,
struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr); struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
unsigned data_type = type == S_META unsigned data_type = type == S_META
? BUCKET_BTREE : BUCKET_DATA; ? BUCKET_BTREE : BUCKET_DATA;
unsigned old_sectors, new_sectors; u64 v;
int disk_sectors, compressed_sectors;
if (sectors > 0) { if (crc_compression_type(crc)) {
old_sectors = 0; unsigned old_sectors, new_sectors;
new_sectors = sectors;
} else { if (sectors > 0) {
old_sectors = e.k->size; old_sectors = 0;
new_sectors = e.k->size + sectors; new_sectors = sectors;
} else {
old_sectors = e.k->size;
new_sectors = e.k->size + sectors;
}
sectors = -__disk_sectors(crc, old_sectors)
+__disk_sectors(crc, new_sectors);
} }
disk_sectors = -__disk_sectors(crc, old_sectors) if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
+ __disk_sectors(crc, new_sectors);
compressed_sectors = -__compressed_sectors(crc, old_sectors)
+ __compressed_sectors(crc, new_sectors);
if (gc_will_visit) {
if (journal_seq) if (journal_seq)
bucket_cmpxchg(g, new, ({ bucket_cmpxchg(g, new, ({
new.touched_this_mount = 1; new.touched_this_mount = 1;
@ -486,10 +454,12 @@ static void bch2_mark_pointer(struct bch_fs *c,
new.journal_seq = journal_seq; new.journal_seq = journal_seq;
})); }));
goto out; return;
} }
old = bucket_data_cmpxchg(ca, g, new, ({ v = READ_ONCE(g->_mark.counter);
do {
new.counter = old.counter = v;
saturated = 0; saturated = 0;
/* /*
@ -498,21 +468,21 @@ static void bch2_mark_pointer(struct bch_fs *c,
* checked the gen * checked the gen
*/ */
if (gen_after(new.gen, ptr->gen)) { if (gen_after(new.gen, ptr->gen)) {
EBUG_ON(type != S_CACHED && EBUG_ON(!ptr->cached &&
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
return; return;
} }
if (type != S_CACHED && if (!ptr->cached &&
new.dirty_sectors == GC_MAX_SECTORS_USED && new.dirty_sectors == GC_MAX_SECTORS_USED &&
disk_sectors < 0) sectors < 0)
saturated = -disk_sectors; saturated = -sectors;
if (type == S_CACHED) if (ptr->cached)
saturated_add(ca, new.cached_sectors, disk_sectors, saturated_add(ca, new.cached_sectors, sectors,
GC_MAX_SECTORS_USED); GC_MAX_SECTORS_USED);
else else
saturated_add(ca, new.dirty_sectors, disk_sectors, saturated_add(ca, new.dirty_sectors, sectors,
GC_MAX_SECTORS_USED); GC_MAX_SECTORS_USED);
if (!new.dirty_sectors && if (!new.dirty_sectors &&
@ -528,7 +498,16 @@ static void bch2_mark_pointer(struct bch_fs *c,
} }
new.touched_this_mount = 1; new.touched_this_mount = 1;
}));
if (flags & BCH_BUCKET_MARK_NOATOMIC) {
g->_mark = new;
break;
}
} while ((v = cmpxchg(&g->_mark.counter,
old.counter,
new.counter)) != old.counter);
bch2_dev_usage_update(ca, old, new);
if (old.data_type != data_type && if (old.data_type != data_type &&
(old.data_type || (old.data_type ||
@ -537,7 +516,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)", bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
g - ca->buckets, old.data_type, new.data_type); g - ca->buckets, old.data_type, new.data_type);
BUG_ON(!may_make_unavailable && BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new)); bucket_became_unavailable(c, old, new));
if (saturated && if (saturated &&
@ -549,66 +528,61 @@ static void bch2_mark_pointer(struct bch_fs *c,
wake_up_process(c->gc_thread); wake_up_process(c->gc_thread);
} }
} }
out:
stats->s[S_COMPRESSED][type] += compressed_sectors;
stats->s[S_UNCOMPRESSED][type] += sectors;
} }
static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e, static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
s64 sectors, bool metadata, s64 sectors, bool metadata,
bool may_make_unavailable,
struct bch_fs_usage *stats, struct bch_fs_usage *stats,
bool gc_will_visit, u64 journal_seq) u64 journal_seq, unsigned flags)
{ {
const struct bch_extent_ptr *ptr; const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc; const union bch_extent_crc *crc;
enum s_alloc type = metadata ? S_META : S_DIRTY; enum s_alloc type = metadata ? S_META : S_DIRTY;
unsigned replicas = 0;
BUG_ON(metadata && bkey_extent_is_cached(e.k)); BUG_ON(metadata && bkey_extent_is_cached(e.k));
BUG_ON(!sectors); BUG_ON(!sectors);
extent_for_each_ptr_crc(e, ptr, crc) extent_for_each_ptr_crc(e, ptr, crc) {
bch2_mark_pointer(c, e, crc, ptr, sectors, bch2_mark_pointer(c, e, crc, ptr, sectors, type,
ptr->cached ? S_CACHED : type, stats, journal_seq, flags);
may_make_unavailable, replicas += !ptr->cached;
stats, gc_will_visit, journal_seq); }
BUG_ON(replicas >= BCH_REPLICAS_MAX);
if (replicas)
stats->s[replicas - 1].data[type] += sectors;
} }
static void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata, s64 sectors, bool metadata,
bool may_make_unavailable, struct bch_fs_usage *stats,
struct bch_fs_usage *stats, u64 journal_seq, unsigned flags)
bool gc_will_visit, u64 journal_seq)
{ {
switch (k.k->type) { switch (k.k->type) {
case BCH_EXTENT: case BCH_EXTENT:
case BCH_EXTENT_CACHED: case BCH_EXTENT_CACHED:
bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata, bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
may_make_unavailable, stats, stats, journal_seq, flags);
gc_will_visit, journal_seq);
break; break;
case BCH_RESERVATION: { case BCH_RESERVATION: {
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
stats->persistent_reserved += r.v->nr_replicas * sectors; if (r.v->nr_replicas)
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
break; break;
} }
} }
} }
void __bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata,
struct bch_fs_usage *stats)
{
__bch2_mark_key(c, k, sectors, metadata, true, stats, false, 0);
}
void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata) s64 sectors, bool metadata, unsigned flags)
{ {
struct bch_fs_usage stats = { 0 }; struct bch_fs_usage stats = { 0 };
__bch2_gc_mark_key(c, k, sectors, metadata, &stats); __bch2_mark_key(c, k, sectors, metadata, &stats, 0,
flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
preempt_disable(); preempt_disable();
bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats); bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats);
@ -619,6 +593,8 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata, struct gc_pos gc_pos, s64 sectors, bool metadata, struct gc_pos gc_pos,
struct bch_fs_usage *stats, u64 journal_seq) struct bch_fs_usage *stats, u64 journal_seq)
{ {
unsigned flags = gc_will_visit(c, gc_pos)
? BCH_BUCKET_MARK_GC_WILL_VISIT : 0;
/* /*
* synchronization w.r.t. GC: * synchronization w.r.t. GC:
* *
@ -647,9 +623,7 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
* (e.g. the btree node lock, or the relevant allocator lock). * (e.g. the btree node lock, or the relevant allocator lock).
*/ */
lg_local_lock(&c->usage_lock); lg_local_lock(&c->usage_lock);
__bch2_mark_key(c, k, sectors, metadata, false, stats, __bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags);
gc_will_visit(c, gc_pos), journal_seq);
bch2_fs_stats_verify(c); bch2_fs_stats_verify(c);
lg_local_unlock(&c->usage_lock); lg_local_unlock(&c->usage_lock);
} }

View File

@ -124,9 +124,9 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
{ {
return max_t(s64, 0, return max_t(s64, 0,
ca->mi.nbuckets - ca->mi.first_bucket - ca->mi.nbuckets - ca->mi.first_bucket -
stats.buckets_dirty - stats.buckets[S_META] -
stats.buckets_alloc - stats.buckets[S_DIRTY] -
stats.buckets_meta); stats.buckets_alloc);
} }
/* /*
@ -157,16 +157,31 @@ struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos); struct disk_reservation *, struct gc_pos);
struct fs_usage_sum {
u64 data;
u64 reserved;
};
static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
{
struct fs_usage_sum sum = { 0 };
unsigned i;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
sum.data += (stats.s[i].data[S_META] +
stats.s[i].data[S_DIRTY]) * (i + 1);
sum.reserved += stats.s[i].persistent_reserved * (i + 1);
}
sum.reserved += stats.online_reserved;
return sum;
}
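For example, 100 dirty sectors stored with two replicas appear as stats.s[1].data[S_DIRTY] == 100 and contribute 200 sectors to sum.data, while a persistent reservation of 50 sectors at three replicas adds 150 to sum.reserved; online_reserved is counted once, as-is.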
static inline u64 __bch2_fs_sectors_used(struct bch_fs *c) static inline u64 __bch2_fs_sectors_used(struct bch_fs *c)
{ {
struct bch_fs_usage stats = __bch2_fs_usage_read(c); struct fs_usage_sum sum = __fs_usage_sum(__bch2_fs_usage_read(c));
u64 reserved = stats.persistent_reserved +
stats.online_reserved;
return stats.s[S_COMPRESSED][S_META] + return sum.data + sum.reserved + (sum.reserved >> 7);
stats.s[S_COMPRESSED][S_DIRTY] +
reserved +
(reserved >> 7);
} }
static inline u64 bch2_fs_sectors_used(struct bch_fs *c) static inline u64 bch2_fs_sectors_used(struct bch_fs *c)
@ -199,9 +214,15 @@ void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *, void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,
enum bucket_data_type, bool); enum bucket_data_type, bool);
void __bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, #define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
struct bch_fs_usage *); #define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 1)
void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool); #define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 2)
void __bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
struct bch_fs_usage *, u64, unsigned);
void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c,
s64, bool, unsigned);
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
struct gc_pos, struct bch_fs_usage *, u64); struct gc_pos, struct bch_fs_usage *, u64);

View File

@ -7,7 +7,6 @@
enum bucket_data_type { enum bucket_data_type {
BUCKET_DATA = 0, BUCKET_DATA = 0,
BUCKET_BTREE, BUCKET_BTREE,
BUCKET_PRIOS,
BUCKET_JOURNAL, BUCKET_JOURNAL,
BUCKET_SB, BUCKET_SB,
}; };
@ -49,32 +48,33 @@ struct bucket {
}; };
}; };
enum s_compressed { /* kill, switch to bucket_data_type */
S_COMPRESSED,
S_UNCOMPRESSED,
S_COMPRESSED_NR,
};
enum s_alloc { enum s_alloc {
S_META, S_META,
S_DIRTY, S_DIRTY,
S_CACHED,
S_ALLOC_NR, S_ALLOC_NR,
}; };
struct bch_dev_usage { struct bch_dev_usage {
u64 buckets_dirty; u64 buckets[S_ALLOC_NR];
u64 buckets_cached; u64 buckets_cached;
u64 buckets_meta;
u64 buckets_alloc; u64 buckets_alloc;
/* _compressed_ sectors: */
u64 sectors[S_ALLOC_NR]; u64 sectors[S_ALLOC_NR];
u64 sectors_cached;
}; };
struct bch_fs_usage { struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */ /* all fields are in units of 512 byte sectors: */
u64 s[S_COMPRESSED_NR][S_ALLOC_NR];
u64 persistent_reserved; /* _uncompressed_ sectors: */
struct {
u64 data[S_ALLOC_NR];
u64 persistent_reserved;
} s[BCH_REPLICAS_MAX];
u64 online_reserved; u64 online_reserved;
u64 available_cache; u64 available_cache;
}; };

View File

@ -73,12 +73,12 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
return -EINVAL; return -EINVAL;
user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
if (!devs) if (!user_devs)
return -ENOMEM; return -ENOMEM;
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
if (copy_from_user(user_devs, user_arg->devs, if (copy_from_user(user_devs, arg.devs,
sizeof(u64) * arg.nr_devs)) sizeof(u64) * arg.nr_devs))
goto err; goto err;

View File

@ -71,7 +71,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
memcpy(n_ondisk, n_sorted, btree_bytes(c)); memcpy(n_ondisk, n_sorted, btree_bytes(c));
bch2_btree_node_read_done(c, v, pick.ca, &pick.ptr); bch2_btree_node_read_done(c, v);
n_sorted = c->verify_data->data; n_sorted = c->verify_data->data;
percpu_ref_put(&pick.ca->io_ref); percpu_ref_put(&pick.ca->io_ref);

View File

@ -26,7 +26,7 @@ void bch2_fatal_error(struct bch_fs *c)
bch_err(c, "emergency read only"); bch_err(c, "emergency read only");
} }
void bch2_nonfatal_io_error_work(struct work_struct *work) void bch2_io_error_work(struct work_struct *work)
{ {
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
struct bch_fs *c = ca->fs; struct bch_fs *c = ca->fs;
@ -45,9 +45,9 @@ void bch2_nonfatal_io_error_work(struct work_struct *work)
mutex_unlock(&c->state_lock); mutex_unlock(&c->state_lock);
} }
void bch2_nonfatal_io_error(struct bch_dev *ca) void bch2_io_error(struct bch_dev *ca)
{ {
queue_work(system_long_wq, &ca->io_error_work); //queue_work(system_long_wq, &ca->io_error_work);
} }
#ifdef __KERNEL__ #ifdef __KERNEL__

View File

@ -179,63 +179,32 @@ do { \
_ret; \ _ret; \
}) })
#define bch2_dev_fatal_error(ca, ...) \
do { \
bch_err(ca, __VA_ARGS__); \
bch2_fatal_error(c); \
} while (0)
#define bch2_dev_fatal_io_error(ca, fmt, ...) \
do { \
printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \
"fatal IO error on %s for " fmt), \
(ca)->name, ##__VA_ARGS__); \
bch2_fatal_error((ca)->fs); \
} while (0)
#define bch2_dev_fatal_io_err_on(cond, ca, ...) \
({ \
int _ret = !!(cond); \
\
if (_ret) \
bch2_dev_fatal_io_error(ca, __VA_ARGS__); \
_ret; \
})
/* /*
* Nonfatal IO errors: either recoverable metadata IO (because we have * IO errors: either recoverable metadata IO (because we have replicas), or data
* replicas), or data IO - we need to log it and print out a message, but we * IO - we need to log it and print out a message, but we don't (necessarily)
* don't (necessarily) want to shut down the fs: * want to shut down the fs:
*/ */
void bch2_nonfatal_io_error_work(struct work_struct *); void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */ /* Does the error handling without logging a message */
void bch2_nonfatal_io_error(struct bch_dev *); void bch2_io_error(struct bch_dev *);
#if 0
#define bch2_fs_nonfatal_io_error(c, ...) \
do { \
bch_err(c, __VA_ARGS__); \
bch2_nonfatal_io_error(c); \
} while (0)
#endif
/* Logs message and handles the error: */ /* Logs message and handles the error: */
#define bch2_dev_nonfatal_io_error(ca, fmt, ...) \ #define bch2_dev_io_error(ca, fmt, ...) \
do { \ do { \
printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \
"IO error on %s for " fmt), \ "IO error on %s for " fmt), \
(ca)->name, ##__VA_ARGS__); \ (ca)->name, ##__VA_ARGS__); \
bch2_nonfatal_io_error(ca); \ bch2_io_error(ca); \
} while (0) } while (0)
#define bch2_dev_nonfatal_io_err_on(cond, ca, ...) \ #define bch2_dev_io_err_on(cond, ca, ...) \
({ \ ({ \
bool _ret = (cond); \ bool _ret = (cond); \
\ \
if (_ret) \ if (_ret) \
bch2_dev_nonfatal_io_error(ca, __VA_ARGS__); \ bch2_dev_io_error(ca, __VA_ARGS__); \
_ret; \ _ret; \
}) })

View File

@ -9,6 +9,8 @@
#include "bkey_methods.h" #include "bkey_methods.h"
#include "btree_gc.h" #include "btree_gc.h"
#include "btree_update.h" #include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h" #include "checksum.h"
#include "debug.h" #include "debug.h"
#include "dirent.h" #include "dirent.h"
@ -497,6 +499,54 @@ out:
return out - buf; return out - buf;
} }
void bch2_get_read_device(struct bch_fs *c,
const struct bkey *k,
const struct bch_extent_ptr *ptr,
const union bch_extent_crc *crc,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
struct bch_dev *ca = c->devs[ptr->dev];
if (ptr->cached && ptr_stale(ca, ptr))
return;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
return;
if (avoid && test_bit(ca->dev_idx, avoid->d))
return;
if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
return;
if (!percpu_ref_tryget(&ca->io_ref))
return;
if (pick->ca)
percpu_ref_put(&pick->ca->io_ref);
*pick = (struct extent_pick_ptr) {
.ptr = *ptr,
.ca = ca,
};
if (k->size)
pick->crc = crc_to_128(k, crc);
}
static void extent_pick_read_device(struct bch_fs *c,
struct bkey_s_c_extent e,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
extent_for_each_ptr_crc(e, ptr, crc)
bch2_get_read_device(c, e.k, ptr, crc, avoid, pick);
}
/* Btree ptrs */ /* Btree ptrs */
static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
@ -615,36 +665,10 @@ static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
struct extent_pick_ptr struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b) bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
{ {
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
struct extent_pick_ptr pick = { .ca = NULL }; struct extent_pick_ptr pick = { .ca = NULL };
extent_for_each_ptr_crc(e, ptr, crc) { extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
struct bch_dev *ca = c->devs[ptr->dev]; NULL, &pick);
struct btree *root = btree_node_root(c, b);
if (bch2_fs_inconsistent_on(crc, c,
"btree node pointer with crc at btree %u level %u/%u bucket %zu",
b->btree_id, b->level, root ? root->level : -1,
PTR_BUCKET_NR(ca, ptr)))
break;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
continue;
if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
continue;
if (!percpu_ref_tryget(&ca->io_ref))
continue;
if (pick.ca)
percpu_ref_put(&pick.ca->io_ref);
pick.ca = ca;
pick.ptr = *ptr;
}
return pick; return pick;
} }
@ -2029,13 +2053,11 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
* as the pointers are sorted by tier, hence preferring pointers to tier 0 * as the pointers are sorted by tier, hence preferring pointers to tier 0
* rather than pointers to tier 1. * rather than pointers to tier 1.
*/ */
void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
struct bch_dev *avoid, struct bch_devs_mask *avoid,
struct extent_pick_ptr *ret) struct extent_pick_ptr *ret)
{ {
struct bkey_s_c_extent e; struct bkey_s_c_extent e;
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
switch (k.k->type) { switch (k.k->type) {
case KEY_TYPE_DELETED: case KEY_TYPE_DELETED:
@ -2053,32 +2075,7 @@ void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
e = bkey_s_c_to_extent(k); e = bkey_s_c_to_extent(k);
ret->ca = NULL; ret->ca = NULL;
extent_for_each_ptr_crc(e, ptr, crc) { extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
struct bch_dev *ca = c->devs[ptr->dev];
if (ptr->cached && ptr_stale(ca, ptr))
continue;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
continue;
if (ret->ca &&
(ca == avoid ||
ret->ca->mi.tier < ca->mi.tier))
continue;
if (!percpu_ref_tryget(&ca->io_ref))
continue;
if (ret->ca)
percpu_ref_put(&ret->ca->io_ref);
*ret = (struct extent_pick_ptr) {
.crc = crc_to_128(e.k, crc),
.ptr = *ptr,
.ca = ca,
};
}
if (!ret->ca && !bkey_extent_is_cached(e.k)) if (!ret->ca && !bkey_extent_is_cached(e.k))
ret->ca = ERR_PTR(-EIO); ret->ca = ERR_PTR(-EIO);

View File

@ -3,11 +3,16 @@
#include "bcachefs.h" #include "bcachefs.h"
#include "bkey.h" #include "bkey.h"
#include "io_types.h"
struct bch_fs;
struct journal_res;
struct btree_node_iter; struct btree_node_iter;
struct btree_insert; struct btree_insert;
struct btree_insert_entry; struct btree_insert_entry;
struct extent_insert_hook; struct extent_insert_hook;
struct bch_devs_mask;
union bch_extent_crc;
struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *, struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
struct btree *, struct btree *,
@ -20,27 +25,18 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
extern const struct bkey_ops bch2_bkey_btree_ops; extern const struct bkey_ops bch2_bkey_btree_ops;
extern const struct bkey_ops bch2_bkey_extent_ops; extern const struct bkey_ops bch2_bkey_extent_ops;
struct bch_fs; void bch2_get_read_device(struct bch_fs *,
struct journal_res; const struct bkey *,
const struct bch_extent_ptr *,
struct extent_pick_ptr { const union bch_extent_crc *,
struct bch_extent_crc128 crc; struct bch_devs_mask *,
struct bch_extent_ptr ptr; struct extent_pick_ptr *);
struct bch_dev *ca;
};
struct extent_pick_ptr struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *, const struct btree *); bch2_btree_pick_ptr(struct bch_fs *, const struct btree *);
void bch2_extent_pick_ptr_avoiding(struct bch_fs *, struct bkey_s_c, void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
struct bch_dev *, struct extent_pick_ptr *); struct bch_devs_mask *,
struct extent_pick_ptr *);
static inline void
bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
struct extent_pick_ptr *ret)
{
bch2_extent_pick_ptr_avoiding(c, k, NULL, ret);
}
enum btree_insert_ret enum btree_insert_ret
bch2_insert_fixup_extent(struct btree_insert *, bch2_insert_fixup_extent(struct btree_insert *,
@ -558,6 +554,12 @@ void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned);
const struct bch_extent_ptr * const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned); bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
struct bch_extent_ptr *
bch2_extent_find_ptr(struct bch_fs *, struct bkey_s_extent,
struct bch_extent_ptr);
struct bch_extent_ptr *
bch2_extent_find_matching_ptr(struct bch_fs *, struct bkey_s_extent,
struct bkey_s_c_extent);
bool bch2_cut_front(struct bpos, struct bkey_i *); bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *); bool bch2_cut_back(struct bpos, struct bkey *);

View File

@ -21,6 +21,8 @@
#include <linux/task_io_accounting_ops.h> #include <linux/task_io_accounting_ops.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h> #include <trace/events/writeback.h>
struct bio_set *bch2_writepage_bioset; struct bio_set *bch2_writepage_bioset;
@ -700,8 +702,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
{ {
struct bio *bio = &rbio->bio; struct bio *bio = &rbio->bio;
int flags = BCH_READ_RETRY_IF_STALE| int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_PROMOTE| BCH_READ_MAY_PROMOTE;
BCH_READ_MAY_REUSE_BIO;
while (1) { while (1) {
struct extent_pick_ptr pick; struct extent_pick_ptr pick;
@ -727,7 +728,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_unlock(iter); bch2_btree_iter_unlock(iter);
k = bkey_i_to_s_c(&tmp.k); k = bkey_i_to_s_c(&tmp.k);
bch2_extent_pick_ptr(c, k, &pick); bch2_extent_pick_ptr(c, k, NULL, &pick);
if (IS_ERR(pick.ca)) { if (IS_ERR(pick.ca)) {
bcache_io_error(c, bio, "no device to read from"); bcache_io_error(c, bio, "no device to read from");
bio_endio(bio); bio_endio(bio);
@ -753,15 +754,14 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
bkey_extent_is_compressed(k)) bkey_extent_is_compressed(k))
bch2_mark_pages_unalloc(bio); bch2_mark_pages_unalloc(bio);
if (is_last)
flags |= BCH_READ_IS_LAST;
if (pick.ca) { if (pick.ca) {
PTR_BUCKET(pick.ca, &pick.ptr)->prio[READ] = if (!is_last) {
c->prio_clock[READ].hand; bio_inc_remaining(&rbio->bio);
flags |= BCH_READ_MUST_CLONE;
trace_read_split(&rbio->bio);
}
bch2_read_extent(c, rbio, k, &pick, flags); bch2_read_extent(c, rbio, k, &pick, flags);
flags &= ~BCH_READ_MAY_REUSE_BIO;
} else { } else {
zero_fill_bio(bio); zero_fill_bio(bio);
@ -803,9 +803,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
BCH_ENCODED_EXTENT_MAX >> PAGE_SECTOR_SHIFT); BCH_ENCODED_EXTENT_MAX >> PAGE_SECTOR_SHIFT);
struct bch_read_bio *rbio = struct bch_read_bio *rbio =
container_of(bio_alloc_bioset(GFP_NOFS, n, to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read));
&c->bio_read),
struct bch_read_bio, bio);
rbio->bio.bi_end_io = bch2_readpages_end_io; rbio->bio.bi_end_io = bch2_readpages_end_io;
bio_add_page_contig(&rbio->bio, page); bio_add_page_contig(&rbio->bio, page);
@ -854,9 +852,7 @@ int bch2_readpage(struct file *file, struct page *page)
struct bch_fs *c = inode->i_sb->s_fs_info; struct bch_fs *c = inode->i_sb->s_fs_info;
struct bch_read_bio *rbio; struct bch_read_bio *rbio;
rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1, rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read));
&c->bio_read),
struct bch_read_bio, bio);
rbio->bio.bi_end_io = bch2_readpages_end_io; rbio->bio.bi_end_io = bch2_readpages_end_io;
__bchfs_readpage(c, rbio, inode->i_ino, page); __bchfs_readpage(c, rbio, inode->i_ino, page);
@ -1240,9 +1236,7 @@ static int bch2_read_single_page(struct page *page,
int ret; int ret;
DECLARE_COMPLETION_ONSTACK(done); DECLARE_COMPLETION_ONSTACK(done);
rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1, rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read));
&c->bio_read),
struct bch_read_bio, bio);
rbio->bio.bi_private = &done; rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io; rbio->bio.bi_end_io = bch2_read_single_page_end_io;
@ -1464,9 +1458,7 @@ start:
if (iter->count) if (iter->count)
closure_get(&dio->cl); closure_get(&dio->cl);
bch2_read(c, container_of(bio, bch2_read(c, to_rbio(bio), inode->i_ino);
struct bch_read_bio, bio),
inode->i_ino);
} }
if (sync) { if (sync) {
@ -2088,13 +2080,14 @@ static long bch2_fpunch(struct inode *inode, loff_t offset, loff_t len)
if (unlikely(ret)) if (unlikely(ret))
goto out; goto out;
ret = bch2_discard(c, ret = bch2_btree_delete_range(c,
POS(ino, discard_start), BTREE_ID_EXTENTS,
POS(ino, discard_end), POS(ino, discard_start),
ZERO_VERSION, POS(ino, discard_end),
&disk_res, ZERO_VERSION,
&i_sectors_hook.hook, &disk_res,
&ei->journal_seq); &i_sectors_hook.hook,
&ei->journal_seq);
i_sectors_dirty_put(ei, &i_sectors_hook); i_sectors_dirty_put(ei, &i_sectors_hook);
bch2_disk_reservation_put(c, &disk_res); bch2_disk_reservation_put(c, &disk_res);

View File

@ -328,8 +328,11 @@ again:
int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
struct extent_insert_hook *hook, u64 *journal_seq) struct extent_insert_hook *hook, u64 *journal_seq)
{ {
return bch2_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0), return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
ZERO_VERSION, NULL, hook, journal_seq); POS(inode_nr, new_size),
POS(inode_nr + 1, 0),
ZERO_VERSION, NULL, hook,
journal_seq);
} }
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)

File diff suppressed because it is too large

View File

@ -13,18 +13,20 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
const struct bkey_i *);
enum bch_write_flags { enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0), BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_DISCARD = (1 << 1), BCH_WRITE_CACHED = (1 << 1),
BCH_WRITE_CACHED = (1 << 2), BCH_WRITE_FLUSH = (1 << 2),
BCH_WRITE_FLUSH = (1 << 3), BCH_WRITE_DATA_COMPRESSED = (1 << 3),
BCH_WRITE_DISCARD_ON_ERROR = (1 << 4),
BCH_WRITE_DATA_COMPRESSED = (1 << 5),
/* Internal: */ /* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6), BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 4),
BCH_WRITE_DONE = (1 << 7), BCH_WRITE_DONE = (1 << 5),
BCH_WRITE_LOOPED = (1 << 8), BCH_WRITE_LOOPED = (1 << 6),
__BCH_WRITE_KEYLIST_LOCKED = 8,
}; };
static inline u64 *op_journal_seq(struct bch_write_op *op) static inline u64 *op_journal_seq(struct bch_write_op *op)
@ -53,43 +55,54 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio)
return wbio; return wbio;
} }
struct cache_promote_op; void bch2_wake_delayed_writes(unsigned long data);
struct bch_devs_mask;
struct cache_promote_op;
struct extent_pick_ptr; struct extent_pick_ptr;
void bch2_read_extent_iter(struct bch_fs *, struct bch_read_bio *, int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
struct bvec_iter, struct bkey_s_c k, struct bkey_s_c k, struct extent_pick_ptr *, unsigned);
struct extent_pick_ptr *, unsigned); void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
u64, struct bch_devs_mask *, unsigned);
enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
/* internal: */
BCH_READ_MUST_BOUNCE = 1 << 3,
BCH_READ_MUST_CLONE = 1 << 4,
BCH_READ_IN_RETRY = 1 << 5,
};
static inline void bch2_read_extent(struct bch_fs *c, static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *orig, struct bch_read_bio *rbio,
struct bkey_s_c k, struct bkey_s_c k,
struct extent_pick_ptr *pick, struct extent_pick_ptr *pick,
unsigned flags) unsigned flags)
{ {
bch2_read_extent_iter(c, orig, orig->bio.bi_iter, rbio->_state = 0;
k, pick, flags); __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, pick, flags);
} }
enum bch_read_flags { static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
BCH_READ_FORCE_BOUNCE = 1 << 0, u64 inode)
BCH_READ_RETRY_IF_STALE = 1 << 1, {
BCH_READ_PROMOTE = 1 << 2, rbio->_state = 0;
BCH_READ_IS_LAST = 1 << 3, __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_MAY_REUSE_BIO = 1 << 4, BCH_READ_RETRY_IF_STALE|
BCH_READ_USER_MAPPED = 1 << 5, BCH_READ_MAY_PROMOTE|
}; BCH_READ_USER_MAPPED);
}
void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); static inline struct bch_read_bio *rbio_init(struct bio *bio)
{
struct bch_read_bio *rbio = to_rbio(bio);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, rbio->_state = 0;
const struct bkey_i *); return rbio;
}
int bch2_discard(struct bch_fs *, struct bpos, struct bpos,
struct bversion, struct disk_reservation *,
struct extent_insert_hook *, u64 *);
void bch2_read_retry_work(struct work_struct *);
void bch2_wake_delayed_writes(unsigned long data);
#endif /* _BCACHE_IO_H */ #endif /* _BCACHE_IO_H */
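A sketch of submitting a read through the new wrappers, assuming the bio was allocated from c->bio_read so it embeds a struct bch_read_bio; the function name and end_io callback are assumptions, while rbio_init() and bch2_read() are the helpers defined above.

static void example_submit_read(struct bch_fs *c, u64 inum,
				struct bio *bio, bio_end_io_t *end_io)
{
	/* rbio_init() zeroes the per-read state bits (bounce, split, ...) */
	struct bch_read_bio *rbio = rbio_init(bio);

	rbio->bio.bi_end_io = end_io;

	/* enables retry-if-stale, promotion and user-mapped handling */
	bch2_read(c, rbio, inum);
}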

View File

@ -4,11 +4,20 @@
#include "btree_types.h" #include "btree_types.h"
#include "buckets_types.h" #include "buckets_types.h"
#include "keylist_types.h" #include "keylist_types.h"
#include "super_types.h"
#include <linux/llist.h> #include <linux/llist.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
struct extent_pick_ptr {
struct bch_extent_crc128 crc;
struct bch_extent_ptr ptr;
struct bch_dev *ca;
};
struct bch_read_bio { struct bch_read_bio {
struct bch_fs *c;
/* /*
* Reads will often have to be split, and if the extent being read from * Reads will often have to be split, and if the extent being read from
* was checksummed or compressed we'll also have to allocate bounce * was checksummed or compressed we'll also have to allocate bounce
@ -19,33 +28,37 @@ struct bch_read_bio {
*/ */
union { union {
struct bch_read_bio *parent; struct bch_read_bio *parent;
bio_end_io_t *orig_bi_end_io; bio_end_io_t *end_io;
}; };
/* /*
* Saved copy of parent->bi_iter, from submission time - allows us to * Saved copy of bio->bi_iter, from submission time - allows us to
* resubmit on IO error, and also to copy data back to the original bio * resubmit on IO error, and also to copy data back to the original bio
* when we're bouncing: * when we're bouncing:
*/ */
struct bvec_iter parent_iter; struct bvec_iter bvec_iter;
unsigned submit_time_us; unsigned submit_time_us;
u16 flags; u8 flags;
union {
struct {
u8 bounce:1, u8 bounce:1,
split:1; split:1,
process_context:1,
retry:2;
};
u8 _state;
};
struct bch_fs *c; struct extent_pick_ptr pick;
struct bch_dev *ca;
struct bch_extent_ptr ptr;
struct bch_extent_crc128 crc;
struct bversion version; struct bversion version;
struct cache_promote_op *promote; struct promote_op *promote;
/* /*
* If we have to retry the read (IO error, checksum failure, read stale * If we have to retry the read (IO error, checksum failure, read stale
* data (raced with allocator), we retry the portion of the parent bio * data (raced with allocator), we retry the portion of the parent bio
* that failed (i.e. this bio's portion, parent_iter). * that failed (i.e. this bio's portion, bvec_iter).
* *
* But we need to stash the inode somewhere: * But we need to stash the inode somewhere:
*/ */
@ -56,12 +69,6 @@ struct bch_read_bio {
struct bio bio; struct bio bio;
}; };
static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
return rbio->split ? rbio->parent : rbio;
}
struct bch_write_bio { struct bch_write_bio {
struct bch_fs *c; struct bch_fs *c;
struct bch_dev *ca; struct bch_dev *ca;
@ -132,6 +139,8 @@ struct bch_write_op {
int (*index_update_fn)(struct bch_write_op *); int (*index_update_fn)(struct bch_write_op *);
struct bch_devs_mask failed;
struct keylist insert_keys; struct keylist insert_keys;
u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
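The reworked struct bch_read_bio overlays its small per-read flags with a u8 _state through an anonymous union, so rbio_init() and bch2_read_extent() can reset every flag with a single store. A self-contained illustration of the same pattern in plain C (names are hypothetical):

#include <stdio.h>

struct rstate {
	union {
		struct {
			unsigned char	bounce:1,
					split:1,
					process_context:1,
					retry:2;
		};
		unsigned char		_state;
	};
};

int main(void)
{
	struct rstate s;

	s._state = 0;			/* start clean */
	s.bounce = 1;
	s.retry  = 2;
	printf("raw state byte: 0x%x\n", s._state);

	s._state = 0;			/* one store clears every flag */
	printf("bounce=%u retry=%u\n", s.bounce, s.retry);
	return 0;
}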

View File

@ -10,6 +10,7 @@
#include "buckets.h" #include "buckets.h"
#include "btree_gc.h" #include "btree_gc.h"
#include "btree_update.h" #include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h" #include "btree_io.h"
#include "checksum.h" #include "checksum.h"
#include "debug.h" #include "debug.h"
@ -150,7 +151,7 @@ static void journal_seq_blacklist_flush(struct journal *j,
} }
for (i = 0;; i++) { for (i = 0;; i++) {
struct btree_interior_update *as; struct btree_update *as;
struct pending_btree_node_free *d; struct pending_btree_node_free *d;
mutex_lock(&j->blacklist_lock); mutex_lock(&j->blacklist_lock);
@ -673,9 +674,9 @@ reread: sectors_read = min_t(unsigned,
ret = submit_bio_wait(bio); ret = submit_bio_wait(bio);
if (bch2_dev_fatal_io_err_on(ret, ca, if (bch2_dev_io_err_on(ret, ca,
"journal read from sector %llu", "journal read from sector %llu",
offset) || offset) ||
bch2_meta_read_fault("journal")) bch2_meta_read_fault("journal"))
return -EIO; return -EIO;
@ -1086,7 +1087,6 @@ static bool journal_entry_is_open(struct journal *j)
void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
{ {
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_prev_buf(j); struct journal_buf *w = journal_prev_buf(j);
atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count); atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
@ -1096,10 +1096,10 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
__bch2_time_stats_update(j->delay_time, __bch2_time_stats_update(j->delay_time,
j->need_write_time); j->need_write_time);
#if 0 #if 0
closure_call(&j->io, journal_write, NULL, &c->cl); closure_call(&j->io, journal_write, NULL, NULL);
#else #else
/* Shut sparse up: */ /* Shut sparse up: */
closure_init(&j->io, &c->cl); closure_init(&j->io, NULL);
set_closure_fn(&j->io, journal_write, NULL); set_closure_fn(&j->io, journal_write, NULL);
journal_write(&j->io); journal_write(&j->io);
#endif #endif
@ -1734,13 +1734,11 @@ void bch2_journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin) struct journal_entry_pin *pin)
{ {
unsigned long flags; unsigned long flags;
bool wakeup; bool wakeup = false;
if (!journal_pin_active(pin))
return;
spin_lock_irqsave(&j->pin_lock, flags); spin_lock_irqsave(&j->pin_lock, flags);
wakeup = __journal_pin_drop(j, pin); if (journal_pin_active(pin))
wakeup = __journal_pin_drop(j, pin);
spin_unlock_irqrestore(&j->pin_lock, flags); spin_unlock_irqrestore(&j->pin_lock, flags);
/* /*
@ -2099,60 +2097,6 @@ static void journal_write_compact(struct jset *jset)
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
} }
static void journal_write_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "journal write") ||
bch2_meta_write_fault("journal"))
bch2_journal_halt(j);
closure_put(&j->io);
percpu_ref_put(&ca->io_ref);
}
static void journal_write_done(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct journal_buf *w = journal_prev_buf(j);
__bch2_time_stats_update(j->write_time, j->write_start_time);
j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
/*
* Updating last_seq_ondisk may let journal_reclaim_work() discard more
* buckets:
*
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
BUG_ON(!j->reservations.prev_buf_unwritten);
atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
&j->reservations.counter);
/*
* XXX: this is racy, we could technically end up doing the wake up
* after the journal_buf struct has been reused for the next write
* (because we're clearing JOURNAL_IO_IN_FLIGHT) and wake up things that
* are waiting on the _next_ write, not this one.
*
* The wake up can't come before, because journal_flush_seq_async() is
* looking at JOURNAL_IO_IN_FLIGHT when it has to wait on a journal
* write that was already in flight.
*
* The right fix is to use a lock here, but using j.lock here means it
* has to be a spin_lock_irqsave() lock which then requires propagating
* the irq()ness to other locks and it's all kinds of nastiness.
*/
closure_wake_up(&w->wait);
wake_up(&j->wait);
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{ {
/* we aren't holding j->lock: */ /* we aren't holding j->lock: */
@ -2172,6 +2116,89 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
buf->size = new_size; buf->size = new_size;
} }
static void journal_write_done(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct journal_buf *w = journal_prev_buf(j);
__bch2_time_stats_update(j->write_time, j->write_start_time);
spin_lock(&j->lock);
j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
/*
* Updating last_seq_ondisk may let journal_reclaim_work() discard more
* buckets:
*
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
BUG_ON(!j->reservations.prev_buf_unwritten);
atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
&j->reservations.counter);
closure_wake_up(&w->wait);
wake_up(&j->wait);
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
mod_delayed_work(system_freezable_wq, &j->write_work, 0);
spin_unlock(&j->lock);
}
static void journal_write_error(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
while (j->replicas_failed) {
unsigned idx = __fls(j->replicas_failed);
bch2_extent_drop_ptr_idx(e, idx);
j->replicas_failed ^= 1 << idx;
}
if (!bch2_extent_nr_ptrs(e.c)) {
bch_err(c, "unable to write journal to sufficient devices");
goto err;
}
if (bch2_check_mark_super(c, e.c, BCH_DATA_JOURNAL))
goto err;
out:
journal_write_done(cl);
return;
err:
bch2_fatal_error(c);
bch2_journal_halt(j);
goto out;
}
static void journal_write_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") ||
bch2_meta_write_fault("journal")) {
/* Was this a flush or an actual journal write? */
if (ca->journal.ptr_idx != U8_MAX) {
set_bit(ca->journal.ptr_idx, &j->replicas_failed);
set_closure_fn(&j->io, journal_write_error,
system_highpri_wq);
}
}
closure_put(&j->io);
percpu_ref_put(&ca->io_ref);
}
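journal_write_endio() records which replica failed in the journal's replicas_failed bitmask, and journal_write_error() above drains that mask with __fls(), dropping the highest failed pointer each iteration. A standalone sketch of that loop in plain C, with __builtin_clzl() standing in for the kernel's __fls():

#include <stdio.h>

static unsigned highest_set_bit(unsigned long v)	/* v must be nonzero */
{
	return sizeof(v) * 8 - 1 - __builtin_clzl(v);
}

int main(void)
{
	unsigned long failed = 0x29;		/* replicas 0, 3 and 5 failed */

	while (failed) {
		unsigned idx = highest_set_bit(failed);

		printf("dropping replica %u\n", idx);
		failed ^= 1UL << idx;
	}
	return 0;
}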
static void journal_write(struct closure *cl) static void journal_write(struct closure *cl)
{ {
struct journal *j = container_of(cl, struct journal, io); struct journal *j = container_of(cl, struct journal, io);
@ -2181,7 +2208,7 @@ static void journal_write(struct closure *cl)
struct jset *jset; struct jset *jset;
struct bio *bio; struct bio *bio;
struct bch_extent_ptr *ptr; struct bch_extent_ptr *ptr;
unsigned i, sectors, bytes; unsigned i, sectors, bytes, ptr_idx = 0;
journal_buf_realloc(j, w); journal_buf_realloc(j, w);
jset = w->data; jset = w->data;
@ -2231,7 +2258,7 @@ static void journal_write(struct closure *cl)
bch2_journal_halt(j); bch2_journal_halt(j);
bch_err(c, "Unable to allocate journal write"); bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c); bch2_fatal_error(c);
closure_return_with_destructor(cl, journal_write_done); continue_at(cl, journal_write_done, system_highpri_wq);
} }
if (bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key), if (bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
@ -2255,6 +2282,7 @@ static void journal_write(struct closure *cl)
atomic64_add(sectors, &ca->meta_sectors_written); atomic64_add(sectors, &ca->meta_sectors_written);
ca->journal.ptr_idx = ptr_idx++;
bio = ca->journal.bio; bio = ca->journal.bio;
bio_reset(bio); bio_reset(bio);
bio->bi_iter.bi_sector = ptr->offset; bio->bi_iter.bi_sector = ptr->offset;
@ -2277,6 +2305,7 @@ static void journal_write(struct closure *cl)
!bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) { !bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
percpu_ref_get(&ca->io_ref); percpu_ref_get(&ca->io_ref);
ca->journal.ptr_idx = U8_MAX;
bio = ca->journal.bio; bio = ca->journal.bio;
bio_reset(bio); bio_reset(bio);
bio->bi_bdev = ca->disk_sb.bdev; bio->bi_bdev = ca->disk_sb.bdev;
@ -2290,10 +2319,10 @@ no_io:
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
ptr->offset += sectors; ptr->offset += sectors;
closure_return_with_destructor(cl, journal_write_done); continue_at(cl, journal_write_done, system_highpri_wq);
err: err:
bch2_inconsistent_error(c); bch2_inconsistent_error(c);
closure_return_with_destructor(cl, journal_write_done); continue_at(cl, journal_write_done, system_highpri_wq);
} }
static void journal_write_work(struct work_struct *work) static void journal_write_work(struct work_struct *work)
@ -2524,18 +2553,61 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
spin_unlock(&j->lock); spin_unlock(&j->lock);
} }
static int journal_seq_flushed(struct journal *j, u64 seq)
{
struct journal_buf *buf;
int ret = 1;
spin_lock(&j->lock);
BUG_ON(seq > atomic64_read(&j->seq));
if (seq == atomic64_read(&j->seq)) {
bool set_need_write = false;
ret = 0;
buf = journal_cur_buf(j);
if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
j->need_write_time = local_clock();
set_need_write = true;
}
switch (journal_buf_switch(j, set_need_write)) {
case JOURNAL_ENTRY_ERROR:
ret = -EIO;
break;
case JOURNAL_ENTRY_CLOSED:
/*
* Journal entry hasn't been opened yet, but caller
* claims it has something (seq == j->seq):
*/
BUG();
case JOURNAL_ENTRY_INUSE:
break;
case JOURNAL_UNLOCKED:
return 0;
}
} else if (seq + 1 == atomic64_read(&j->seq) &&
j->reservations.prev_buf_unwritten) {
ret = bch2_journal_error(j);
}
spin_unlock(&j->lock);
return ret;
}
int bch2_journal_flush_seq(struct journal *j, u64 seq) int bch2_journal_flush_seq(struct journal *j, u64 seq)
{ {
struct closure cl;
u64 start_time = local_clock(); u64 start_time = local_clock();
int ret, ret2;
closure_init_stack(&cl); ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
bch2_journal_flush_seq_async(j, seq, &cl);
closure_sync(&cl);
bch2_time_stats_update(j->flush_seq_time, start_time); bch2_time_stats_update(j->flush_seq_time, start_time);
return bch2_journal_error(j); return ret ?: ret2 < 0 ? ret2 : 0;
} }
void bch2_journal_meta_async(struct journal *j, struct closure *parent) void bch2_journal_meta_async(struct journal *j, struct closure *parent)

View File

@ -139,6 +139,7 @@ struct journal {
struct closure io; struct closure io;
struct delayed_work write_work; struct delayed_work write_work;
unsigned long replicas_failed;
/* Sequence number of most recent journal entry (last entry in @pin) */ /* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq; atomic64_t seq;
@ -227,6 +228,7 @@ struct journal_device {
/* Bio for journal reads/writes to this device */ /* Bio for journal reads/writes to this device */
struct bio *bio; struct bio *bio;
u8 ptr_idx;
/* for bch_journal_read_device */ /* for bch_journal_read_device */
struct closure read; struct closure read;

View File

@ -53,3 +53,14 @@ void bch2_keylist_pop_front(struct keylist *l)
bkey_next(l->keys), bkey_next(l->keys),
bch_keylist_u64s(l)); bch_keylist_u64s(l));
} }
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *l)
{
struct bkey_i *k;
for_each_keylist_key(l, k)
BUG_ON(bkey_next(k) != l->top &&
bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
}
#endif

View File

@ -59,4 +59,10 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
#define keylist_single(k) \ #define keylist_single(k) \
((struct keylist) { .keys = k, .top = bkey_next(k) }) ((struct keylist) { .keys = k, .top = bkey_next(k) })
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *);
#else
static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
#endif
#endif /* _BCACHE_KEYLIST_H */ #endif /* _BCACHE_KEYLIST_H */

View File

@ -72,7 +72,7 @@ int bch2_move_data_off_device(struct bch_dev *ca)
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE); bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
ctxt.avoid = ca; __set_bit(ca->dev_idx, ctxt.avoid.d);
/* /*
* In theory, only one pass should be necessary as we've * In theory, only one pass should be necessary as we've

View File

@ -30,7 +30,7 @@ static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c,
} }
static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m, static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m,
struct bkey_s_extent e) struct bkey_s_extent e)
{ {
const struct bch_extent_ptr *ptr; const struct bch_extent_ptr *ptr;
struct bch_extent_ptr *ret; struct bch_extent_ptr *ret;
@ -138,11 +138,11 @@ out:
} }
void bch2_migrate_write_init(struct bch_fs *c, void bch2_migrate_write_init(struct bch_fs *c,
struct migrate_write *m, struct migrate_write *m,
struct write_point *wp, struct write_point *wp,
struct bkey_s_c k, struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr, const struct bch_extent_ptr *move_ptr,
unsigned flags) unsigned flags)
{ {
bkey_reassemble(&m->key, k); bkey_reassemble(&m->key, k);
@ -178,23 +178,18 @@ static void migrate_bio_init(struct moving_io *io, struct bio *bio,
bch2_bio_map(bio, NULL); bch2_bio_map(bio, NULL);
} }
static void moving_io_destructor(struct closure *cl) static void moving_io_free(struct moving_io *io)
{ {
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->ctxt; struct moving_context *ctxt = io->ctxt;
struct bio_vec *bv; struct bio_vec *bv;
int i; int i;
//if (io->replace.failures)
// trace_copy_collision(q, &io->key.k);
atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight); atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
wake_up(&ctxt->wait); wake_up(&ctxt->wait);
bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
if (bv->bv_page) if (bv->bv_page)
__free_page(bv->bv_page); __free_page(bv->bv_page);
kfree(io); kfree(io);
} }
@ -204,27 +199,26 @@ static void moving_error(struct moving_context *ctxt, unsigned flag)
//atomic_or(flag, &ctxt->error_flags); //atomic_or(flag, &ctxt->error_flags);
} }
static void moving_io_after_write(struct closure *cl) static void moving_write_done(struct closure *cl)
{ {
struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->ctxt;
if (io->write.op.error) if (io->write.op.error)
moving_error(ctxt, MOVING_FLAG_WRITE); moving_error(io->ctxt, MOVING_FLAG_WRITE);
moving_io_destructor(cl); //if (io->replace.failures)
// trace_copy_collision(q, &io->key.k);
moving_io_free(io);
} }
static void write_moving(struct moving_io *io) static void write_moving(struct closure *cl)
{ {
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct bch_write_op *op = &io->write.op; struct bch_write_op *op = &io->write.op;
if (op->error) { closure_call(&op->cl, bch2_write, NULL, &io->cl);
closure_return_with_destructor(&io->cl, moving_io_destructor); closure_return_with_destructor(&io->cl, moving_write_done);
} else {
closure_call(&op->cl, bch2_write, NULL, &io->cl);
closure_return_with_destructor(&io->cl, moving_io_after_write);
}
} }
static inline struct moving_io *next_pending_write(struct moving_context *ctxt) static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@ -243,10 +237,8 @@ static void read_moving_endio(struct bio *bio)
trace_move_read_done(&io->write.key.k); trace_move_read_done(&io->write.key.k);
if (bio->bi_error) { if (bio->bi_error)
io->write.op.error = bio->bi_error;
moving_error(io->ctxt, MOVING_FLAG_READ); moving_error(io->ctxt, MOVING_FLAG_READ);
}
io->read_completed = true; io->read_completed = true;
if (next_pending_write(ctxt)) if (next_pending_write(ctxt))
@ -255,43 +247,21 @@ static void read_moving_endio(struct bio *bio)
closure_put(&ctxt->cl); closure_put(&ctxt->cl);
} }
static void __bch2_data_move(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct bch_fs *c = io->write.op.c;
struct extent_pick_ptr pick;
bch2_extent_pick_ptr_avoiding(c, bkey_i_to_s_c(&io->write.key),
io->ctxt->avoid, &pick);
if (IS_ERR_OR_NULL(pick.ca))
closure_return_with_destructor(cl, moving_io_destructor);
bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->write.key.k);
io->rbio.bio.bi_end_io = read_moving_endio;
/*
* dropped by read_moving_endio() - guards against use after free of
* ctxt when doing wakeup
*/
closure_get(&io->ctxt->cl);
bch2_read_extent(c, &io->rbio,
bkey_i_to_s_c(&io->write.key),
&pick, BCH_READ_IS_LAST);
}
int bch2_data_move(struct bch_fs *c, int bch2_data_move(struct bch_fs *c,
struct moving_context *ctxt, struct moving_context *ctxt,
struct write_point *wp, struct write_point *wp,
struct bkey_s_c k, struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr) const struct bch_extent_ptr *move_ptr)
{ {
struct extent_pick_ptr pick;
struct moving_io *io; struct moving_io *io;
bch2_extent_pick_ptr(c, k, &ctxt->avoid, &pick);
if (IS_ERR_OR_NULL(pick.ca))
return pick.ca ? PTR_ERR(pick.ca) : 0;
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) *
DIV_ROUND_UP(k.k->size, PAGE_SECTORS), DIV_ROUND_UP(k.k->size, PAGE_SECTORS), GFP_KERNEL);
GFP_KERNEL);
if (!io) if (!io)
return -ENOMEM; return -ENOMEM;
@ -299,6 +269,10 @@ int bch2_data_move(struct bch_fs *c,
migrate_bio_init(io, &io->rbio.bio, k.k->size); migrate_bio_init(io, &io->rbio.bio, k.k->size);
bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = read_moving_endio;
if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
kfree(io); kfree(io);
return -ENOMEM; return -ENOMEM;
@ -318,7 +292,12 @@ int bch2_data_move(struct bch_fs *c,
atomic_add(k.k->size, &ctxt->sectors_in_flight); atomic_add(k.k->size, &ctxt->sectors_in_flight);
list_add_tail(&io->list, &ctxt->reads); list_add_tail(&io->list, &ctxt->reads);
closure_call(&io->cl, __bch2_data_move, NULL, &ctxt->cl); /*
* dropped by read_moving_endio() - guards against use after free of
* ctxt when doing wakeup
*/
closure_get(&io->ctxt->cl);
bch2_read_extent(c, &io->rbio, k, &pick, 0);
return 0; return 0;
} }
@ -328,8 +307,14 @@ static void do_pending_writes(struct moving_context *ctxt)
while ((io = next_pending_write(ctxt))) { while ((io = next_pending_write(ctxt))) {
list_del(&io->list); list_del(&io->list);
if (io->rbio.bio.bi_error) {
moving_io_free(io);
continue;
}
trace_move_write(&io->write.key.k); trace_move_write(&io->write.key.k);
write_moving(io); closure_call(&io->cl, write_moving, NULL, &ctxt->cl);
} }
} }

View File

@ -46,7 +46,7 @@ struct moving_context {
struct bch_ratelimit *rate; struct bch_ratelimit *rate;
/* Try to avoid reading the following device */ /* Try to avoid reading the following device */
struct bch_dev *avoid; struct bch_devs_mask avoid;
struct list_head reads; struct list_head reads;

View File

@ -181,7 +181,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
if (val) { if (val) {
id = bch2_opt_lookup(name); id = bch2_opt_lookup(name);
if (id < 0) if (id < 0)
return -EINVAL; continue;
ret = parse_one_opt(id, val, &v); ret = parse_one_opt(id, val, &v);
if (ret < 0) if (ret < 0)
@ -196,8 +196,9 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
v = 0; v = 0;
} }
if (bch2_opt_table[id].type != BCH_OPT_BOOL) if (id < 0 ||
return -EINVAL; bch2_opt_table[id].type != BCH_OPT_BOOL)
continue;
} }
bch2_opt_set(opts, id, v); bch2_opt_set(opts, id, v);

View File

@ -700,23 +700,18 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */ /* XXX: return errors directly */
bch2_dev_fatal_io_err_on(bio->bi_error, ca, "superblock write"); if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write); closure_put(&ca->fs->sb_write);
percpu_ref_put(&ca->io_ref); percpu_ref_put(&ca->io_ref);
} }
static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
{ {
struct bch_sb *sb = ca->disk_sb.sb; struct bch_sb *sb = ca->disk_sb.sb;
struct bio *bio = ca->disk_sb.bio; struct bio *bio = ca->disk_sb.bio;
if (idx >= sb->layout.nr_superblocks)
return false;
if (!percpu_ref_tryget(&ca->io_ref))
return false;
sb->offset = sb->layout.sb_offset[idx]; sb->offset = sb->layout.sb_offset[idx];
SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
@ -734,21 +729,23 @@ static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
bch2_bio_map(bio, sb); bch2_bio_map(bio, sb);
percpu_ref_get(&ca->io_ref);
closure_bio_submit(bio, &c->sb_write); closure_bio_submit(bio, &c->sb_write);
return true;
} }
void bch2_write_super(struct bch_fs *c) void bch2_write_super(struct bch_fs *c)
{ {
struct closure *cl = &c->sb_write; struct closure *cl = &c->sb_write;
struct bch_dev *ca; struct bch_dev *ca;
unsigned i, super_idx = 0; unsigned i, sb = 0, nr_wrote;
const char *err; const char *err;
bool wrote; struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
lockdep_assert_held(&c->sb_lock); lockdep_assert_held(&c->sb_lock);
closure_init_stack(cl); closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
le64_add_cpu(&c->disk_sb->seq, 1); le64_add_cpu(&c->disk_sb->seq, 1);
@ -767,15 +764,53 @@ void bch2_write_super(struct bch_fs *c)
test_bit(BCH_FS_ERROR, &c->flags)) test_bit(BCH_FS_ERROR, &c->flags))
goto out; goto out;
for_each_online_member(ca, c, i) {
__set_bit(ca->dev_idx, sb_written.d);
ca->sb_write_error = 0;
}
do { do {
wrote = false; wrote = false;
for_each_online_member(ca, c, i) for_each_online_member(ca, c, i)
if (write_one_super(c, ca, super_idx)) if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
wrote = true; wrote = true;
}
closure_sync(cl); closure_sync(cl);
super_idx++; sb++;
} while (wrote); } while (wrote);
for_each_online_member(ca, c, i)
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
nr_wrote = bitmap_weight(sb_written.d, BCH_SB_MEMBERS_MAX);
can_mount_with_written =
bch2_have_enough_devs(c,
__bch2_replicas_status(c, sb_written),
BCH_FORCE_IF_DEGRADED);
for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
bch2_have_enough_devs(c,
__bch2_replicas_status(c, sb_written),
BCH_FORCE_IF_DEGRADED);
/*
* If we would be able to mount _without_ the devices we successfully
* wrote superblocks to, we weren't able to write to enough devices:
*
* Exception: if we can mount without the successes because we haven't
* written anything (new filesystem), we continue if we'd be able to
* mount with the devices we did successfully write to:
*/
bch2_fs_fatal_err_on(!nr_wrote ||
(can_mount_without_written &&
!can_mount_with_written), c,
"Unable to write superblock to sufficient devices");
out: out:
/* Make new options visible after they're persistent: */ /* Make new options visible after they're persistent: */
bch2_sb_update(c); bch2_sb_update(c);
@ -1087,7 +1122,7 @@ int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
} }
struct replicas_status __bch2_replicas_status(struct bch_fs *c, struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_dev *dev_to_offline) struct bch_devs_mask online_devs)
{ {
struct bch_replicas_cpu_entry *e; struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r; struct bch_replicas_cpu *r;
@ -1114,8 +1149,7 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
if (!replicas_test_dev(e, dev)) if (!replicas_test_dev(e, dev))
continue; continue;
if (bch2_dev_is_online(c->devs[dev]) && if (test_bit(dev, online_devs.d))
c->devs[dev] != dev_to_offline)
nr_online++; nr_online++;
else else
nr_offline++; nr_offline++;
@ -1137,7 +1171,32 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct replicas_status bch2_replicas_status(struct bch_fs *c) struct replicas_status bch2_replicas_status(struct bch_fs *c)
{ {
return __bch2_replicas_status(c, NULL); return __bch2_replicas_status(c, bch2_online_devs(c));
}
bool bch2_have_enough_devs(struct bch_fs *c,
struct replicas_status s,
unsigned flags)
{
if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
s.replicas[BCH_DATA_BTREE].nr_offline) &&
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
return false;
if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
!s.replicas[BCH_DATA_BTREE].nr_online) &&
!(flags & BCH_FORCE_IF_METADATA_LOST))
return false;
if (s.replicas[BCH_DATA_USER].nr_offline &&
!(flags & BCH_FORCE_IF_DATA_DEGRADED))
return false;
if (!s.replicas[BCH_DATA_USER].nr_online &&
!(flags & BCH_FORCE_IF_DATA_LOST))
return false;
return true;
} }
unsigned bch2_replicas_online(struct bch_fs *c, bool meta) unsigned bch2_replicas_online(struct bch_fs *c, bool meta)

View File

@ -4,6 +4,7 @@
#include "extents.h" #include "extents.h"
#include "eytzinger.h" #include "eytzinger.h"
#include "super_types.h" #include "super_types.h"
#include "super.h"
#include <asm/byteorder.h> #include <asm/byteorder.h>
@ -134,8 +135,9 @@ struct replicas_status {
}; };
struct replicas_status __bch2_replicas_status(struct bch_fs *, struct replicas_status __bch2_replicas_status(struct bch_fs *,
struct bch_dev *); struct bch_devs_mask);
struct replicas_status bch2_replicas_status(struct bch_fs *); struct replicas_status bch2_replicas_status(struct bch_fs *);
bool bch2_have_enough_devs(struct bch_fs *, struct replicas_status, unsigned);
unsigned bch2_replicas_online(struct bch_fs *, bool); unsigned bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);

View File

@ -11,6 +11,7 @@
#include "btree_cache.h" #include "btree_cache.h"
#include "btree_gc.h" #include "btree_gc.h"
#include "btree_update.h" #include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h" #include "btree_io.h"
#include "chardev.h" #include "chardev.h"
#include "checksum.h" #include "checksum.h"
@ -416,7 +417,6 @@ static void bch2_fs_exit(struct bch_fs *c)
del_timer_sync(&c->foreground_write_wakeup); del_timer_sync(&c->foreground_write_wakeup);
cancel_delayed_work_sync(&c->pd_controllers_update); cancel_delayed_work_sync(&c->pd_controllers_update);
cancel_work_sync(&c->read_only_work); cancel_work_sync(&c->read_only_work);
cancel_work_sync(&c->read_retry_work);
for (i = 0; i < c->sb.nr_devices; i++) for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i]) if (c->devs[i])
@ -519,10 +519,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->bio_bounce_pages_lock); mutex_init(&c->bio_bounce_pages_lock);
mutex_init(&c->zlib_workspace_lock); mutex_init(&c->zlib_workspace_lock);
bio_list_init(&c->read_retry_list);
spin_lock_init(&c->read_retry_lock);
INIT_WORK(&c->read_retry_work, bch2_read_retry_work);
bio_list_init(&c->btree_write_error_list); bio_list_init(&c->btree_write_error_list);
spin_lock_init(&c->btree_write_error_lock); spin_lock_init(&c->btree_write_error_lock);
INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
@ -584,7 +580,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
sizeof(struct btree_reserve)) || sizeof(struct btree_reserve)) ||
mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_interior_update)) || sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_read_bio, 1, bioset_init(&c->btree_read_bio, 1,
offsetof(struct btree_read_bio, bio)) || offsetof(struct btree_read_bio, bio)) ||
@ -1120,7 +1116,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
spin_lock_init(&ca->freelist_lock); spin_lock_init(&ca->freelist_lock);
bch2_dev_moving_gc_init(ca); bch2_dev_moving_gc_init(ca);
INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work); INIT_WORK(&ca->io_error_work, bch2_io_error_work);
if (bch2_fs_init_fault("dev_alloc")) if (bch2_fs_init_fault("dev_alloc"))
goto err; goto err;
@ -1262,31 +1258,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
/* Device management: */ /* Device management: */
static bool have_enough_devs(struct bch_fs *c,
struct replicas_status s,
unsigned flags)
{
if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
s.replicas[BCH_DATA_BTREE].nr_offline) &&
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
return false;
if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
!s.replicas[BCH_DATA_BTREE].nr_online) &&
!(flags & BCH_FORCE_IF_METADATA_LOST))
return false;
if (s.replicas[BCH_DATA_USER].nr_offline &&
!(flags & BCH_FORCE_IF_DATA_DEGRADED))
return false;
if (!s.replicas[BCH_DATA_USER].nr_online &&
!(flags & BCH_FORCE_IF_DATA_LOST))
return false;
return true;
}
/* /*
* Note: this function is also used by the error paths - when a particular * Note: this function is also used by the error paths - when a particular
* device sees an error, we call it to determine whether we can just set the * device sees an error, we call it to determine whether we can just set the
@ -1299,6 +1270,7 @@ static bool have_enough_devs(struct bch_fs *c,
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags) enum bch_member_state new_state, int flags)
{ {
struct bch_devs_mask new_online_devs;
struct replicas_status s; struct replicas_status s;
struct bch_dev *ca2; struct bch_dev *ca2;
int i, nr_rw = 0, required; int i, nr_rw = 0, required;
@ -1331,19 +1303,12 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
return true; return true;
/* do we have enough devices to read from? */ /* do we have enough devices to read from? */
s = __bch2_replicas_status(c, ca); new_online_devs = bch2_online_devs(c);
__clear_bit(ca->dev_idx, new_online_devs.d);
pr_info("replicas: j %u %u b %u %u d %u %u", s = __bch2_replicas_status(c, new_online_devs);
s.replicas[BCH_DATA_JOURNAL].nr_online,
s.replicas[BCH_DATA_JOURNAL].nr_offline,
s.replicas[BCH_DATA_BTREE].nr_online, return bch2_have_enough_devs(c, s, flags);
s.replicas[BCH_DATA_BTREE].nr_offline,
s.replicas[BCH_DATA_USER].nr_online,
s.replicas[BCH_DATA_USER].nr_offline);
return have_enough_devs(c, s, flags);
default: default:
BUG(); BUG();
} }
@ -1374,7 +1339,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
s = bch2_replicas_status(c); s = bch2_replicas_status(c);
return have_enough_devs(c, s, flags); return bch2_have_enough_devs(c, s, flags);
} }
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)

View File

@ -94,6 +94,18 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
__for_each_online_member(ca, c, iter, \ __for_each_online_member(ca, c, iter, \
(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
struct bch_devs_mask devs;
struct bch_dev *ca;
unsigned i;
memset(&devs, 0, sizeof(devs));
for_each_online_member(ca, c, i)
__set_bit(ca->dev_idx, devs.d);
return devs;
}
struct bch_fs *bch2_bdev_to_fs(struct block_device *); struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le); struct bch_fs *bch2_uuid_to_fs(uuid_le);
int bch2_congested(struct bch_fs *, int); int bch2_congested(struct bch_fs *, int);
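Tying the pieces together, a hypothetical helper (not part of this patch) showing how bch2_online_devs(), __bch2_replicas_status() and bch2_have_enough_devs() combine to answer whether the filesystem could survive losing one device, mirroring the updated bch2_dev_state_allowed():

static bool example_can_lose_device(struct bch_fs *c, unsigned dev_idx)
{
	struct bch_devs_mask online = bch2_online_devs(c);

	__clear_bit(dev_idx, online.d);		/* pretend the device is gone */

	return bch2_have_enough_devs(c,
				     __bch2_replicas_status(c, online),
				     BCH_FORCE_IF_DEGRADED);
}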

View File

@ -9,4 +9,8 @@ struct bcache_superblock {
fmode_t mode; fmode_t mode;
}; };
struct bch_devs_mask {
unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
};
#endif /* _BCACHE_SUPER_TYPES_H */ #endif /* _BCACHE_SUPER_TYPES_H */
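struct bch_devs_mask is simply a fixed-size bitmap sized by BCH_SB_MEMBERS_MAX, driven by the usual __set_bit()/__clear_bit()/test_bit() helpers elsewhere in this diff. A self-contained sketch of the same fixed-size bitmap idea in plain C (all names hypothetical; 64 stands in for BCH_SB_MEMBERS_MAX):

#include <limits.h>
#include <stdio.h>

#define MAX_DEVS	64
#define ULONG_BITS	(sizeof(unsigned long) * CHAR_BIT)
#define TO_LONGS(n)	(((n) + ULONG_BITS - 1) / ULONG_BITS)

struct devs_mask {
	unsigned long d[TO_LONGS(MAX_DEVS)];
};

static void mask_set(struct devs_mask *m, unsigned bit)
{
	m->d[bit / ULONG_BITS] |= 1UL << (bit % ULONG_BITS);
}

static int mask_test(const struct devs_mask *m, unsigned bit)
{
	return (m->d[bit / ULONG_BITS] >> (bit % ULONG_BITS)) & 1;
}

int main(void)
{
	struct devs_mask online = { { 0 } };

	mask_set(&online, 3);
	printf("dev 3: %d, dev 5: %d\n",
	       mask_test(&online, 3), mask_test(&online, 5));
	return 0;
}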

View File

@ -232,24 +232,36 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
return scnprintf(buf, PAGE_SIZE, return scnprintf(buf, PAGE_SIZE,
"capacity:\t\t%llu\n" "capacity:\t\t%llu\n"
"compressed:\n" "1 replicas:\n"
"\tmeta:\t\t%llu\n" "\tmeta:\t\t%llu\n"
"\tdirty:\t\t%llu\n" "\tdirty:\t\t%llu\n"
"\tcached:\t\t%llu\n" "\treserved:\t%llu\n"
"uncompressed:\n" "2 replicas:\n"
"\tmeta:\t\t%llu\n" "\tmeta:\t\t%llu\n"
"\tdirty:\t\t%llu\n" "\tdirty:\t\t%llu\n"
"\tcached:\t\t%llu\n" "\treserved:\t%llu\n"
"persistent reserved sectors:\t%llu\n" "3 replicas:\n"
"online reserved sectors:\t%llu\n", "\tmeta:\t\t%llu\n"
"\tdirty:\t\t%llu\n"
"\treserved:\t%llu\n"
"4 replicas:\n"
"\tmeta:\t\t%llu\n"
"\tdirty:\t\t%llu\n"
"\treserved:\t%llu\n"
"online reserved:\t%llu\n",
c->capacity, c->capacity,
stats.s[S_COMPRESSED][S_META], stats.s[0].data[S_META],
stats.s[S_COMPRESSED][S_DIRTY], stats.s[0].data[S_DIRTY],
stats.s[S_COMPRESSED][S_CACHED], stats.s[0].persistent_reserved,
stats.s[S_UNCOMPRESSED][S_META], stats.s[1].data[S_META],
stats.s[S_UNCOMPRESSED][S_DIRTY], stats.s[1].data[S_DIRTY],
stats.s[S_UNCOMPRESSED][S_CACHED], stats.s[1].persistent_reserved,
stats.persistent_reserved, stats.s[2].data[S_META],
stats.s[2].data[S_DIRTY],
stats.s[2].persistent_reserved,
stats.s[3].data[S_META],
stats.s[3].data[S_DIRTY],
stats.s[3].persistent_reserved,
stats.online_reserved); stats.online_reserved);
} }
@ -708,8 +720,8 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket, stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets_meta, ca->mi.nbuckets - ca->mi.first_bucket, stats.buckets[S_META], ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets_dirty, ca->mi.nbuckets - ca->mi.first_bucket, stats.buckets[S_DIRTY], ca->mi.nbuckets - ca->mi.first_bucket,
__dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket, __dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket,
c->freelist_wait.list.first ? "waiting" : "empty", c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
@ -749,11 +761,11 @@ SHOW(bch2_dev)
sysfs_hprint(dirty_data, stats.sectors[S_DIRTY] << 9); sysfs_hprint(dirty_data, stats.sectors[S_DIRTY] << 9);
sysfs_print(dirty_bytes, stats.sectors[S_DIRTY] << 9); sysfs_print(dirty_bytes, stats.sectors[S_DIRTY] << 9);
sysfs_print(dirty_buckets, stats.buckets_dirty); sysfs_print(dirty_buckets, stats.buckets[S_DIRTY]);
sysfs_hprint(cached_data, stats.sectors[S_CACHED] << 9); sysfs_hprint(cached_data, stats.sectors_cached << 9);
sysfs_print(cached_bytes, stats.sectors[S_CACHED] << 9); sysfs_print(cached_bytes, stats.sectors_cached << 9);
sysfs_print(cached_buckets, stats.buckets_cached); sysfs_print(cached_buckets, stats.buckets_cached);
sysfs_print(meta_buckets, stats.buckets_meta); sysfs_print(meta_buckets, stats.buckets[S_META]);
sysfs_print(alloc_buckets, stats.buckets_alloc); sysfs_print(alloc_buckets, stats.buckets_alloc);
sysfs_print(available_buckets, dev_buckets_available(ca)); sysfs_print(available_buckets, dev_buckets_available(ca));
sysfs_print(free_buckets, dev_buckets_free(ca)); sysfs_print(free_buckets, dev_buckets_free(ca));