Update bcachefs sources to 3b4024f944

Kent Overstreet 2017-04-10 21:19:15 -08:00
parent e394bd4ba3
commit 03bc9d71b1
13 changed files with 230 additions and 376 deletions

View File

@ -1 +1 @@
da037866e669b09edc6b049ce09535d3456474cb
3b4024f94489e4d8dc8eb7f1278754a2545f8026

View File

@ -754,7 +754,7 @@ struct bch_fs {
unsigned bucket_journal_seq;
/* The rest of this all shows up in sysfs */
atomic_long_t cache_read_races;
atomic_long_t read_realloc_races;
unsigned foreground_write_ratelimit_enabled:1;
unsigned copy_gc_enabled:1;

View File

@ -1630,82 +1630,19 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
}
}
/*
* Write all dirty btree nodes to disk, including roots
*/
void bch2_btree_flush(struct bch_fs *c)
void bch2_btree_verify_flushed(struct bch_fs *c)
{
struct closure cl;
struct btree *b;
struct bucket_table *tbl;
struct rhash_head *pos;
bool saw_dirty;
struct btree *b;
unsigned i;
closure_init_stack(&cl);
rcu_read_lock();
tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
&c->btree_cache_table);
do {
saw_dirty = false;
i = 0;
restart:
tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
&c->btree_cache_table);
for (; i < tbl->size; i++)
rht_for_each_entry_rcu(b, pos, tbl, i, hash) {
saw_dirty |= btree_node_dirty(b);
if (btree_node_dirty(b) &&
btree_node_may_write(b)) {
rcu_read_unlock();
six_lock_read(&b->lock);
bch2_btree_node_write_dirty(c, b, &cl, 1);
six_unlock_read(&b->lock);
rcu_read_lock();
goto restart;
}
}
} while (saw_dirty);
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(b, pos, tbl, i, hash)
BUG_ON(btree_node_dirty(b));
rcu_read_unlock();
closure_sync(&cl);
}
/**
* bch_btree_node_flush_journal - flush any journal entries that contain keys
* from this node
*
* The bset's journal sequence number is used for preserving ordering of index
* updates across unclean shutdowns - it's used to ignore bsets newer than the
* most recent journal entry.
*
* But when rewriting btree nodes we compact all the bsets in a btree node - and
* if we compacted a bset that should be ignored with bsets we do need, that
* would be bad. So to avoid that, prior to making the new node visible ensure
* that the journal has been flushed so that all the bsets we compacted should
* be visible.
*/
void bch2_btree_node_flush_journal_entries(struct bch_fs *c,
struct btree *b,
struct closure *cl)
{
int i = b->nsets;
/*
* Journal sequence numbers in the different bsets will always be in
* ascending order, we only need to flush the highest - except that the
* most recent bset might not have a journal sequence number yet, so we
* need to loop:
*/
while (i--) {
u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
if (seq) {
bch2_journal_flush_seq_async(&c->journal, seq, cl);
break;
}
}
}
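The doc comment above (this helper is dropped by the commit) carries the invariant behind the scan: journal sequence numbers stored in a node's bsets ascend, but the most recent bset may not have been assigned one yet, so the highest flushable sequence is the last nonzero value walking backwards. A minimal standalone sketch of that scan, using a plain array instead of the kernel's bset/btree structures; all names here are illustrative, not from the source:

#include <stdint.h>
#include <stdio.h>

/* Return the highest journal seq already assigned to any bset, or 0. */
static uint64_t newest_assigned_journal_seq(const uint64_t *bset_seq, int nsets)
{
        int i = nsets;

        /* seqs ascend across bsets, but the newest one may still be 0 */
        while (i--)
                if (bset_seq[i])
                        return bset_seq[i];

        return 0;
}

int main(void)
{
        uint64_t seqs[] = { 100, 105, 0 };      /* newest bset not journalled yet */

        printf("flush journal up to seq %llu\n",
               (unsigned long long) newest_assigned_journal_seq(seqs, 3));
        return 0;
}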

View File

@ -94,8 +94,6 @@ do { \
} \
} while (0)
void bch2_btree_flush(struct bch_fs *);
void bch2_btree_node_flush_journal_entries(struct bch_fs *, struct btree *,
struct closure *);
void bch2_btree_verify_flushed(struct bch_fs *);
#endif /* _BCACHE_BTREE_IO_H */

View File

@ -161,15 +161,14 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
{
trace_btree_node_free(c, b);
BUG_ON(btree_node_dirty(b));
BUG_ON(b == btree_node_root(c, b));
BUG_ON(b->ob);
BUG_ON(!list_empty(&b->write_blocked));
six_lock_write(&b->lock);
clear_btree_node_noevict(b);
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
clear_btree_node_dirty(b);
six_lock_write(&b->lock);
bch2_btree_node_hash_remove(c, b);
@ -192,6 +191,8 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
b->ob = NULL;
clear_btree_node_dirty(b);
__btree_node_free(c, b, NULL);
bch2_open_bucket_put(c, ob);
@ -890,7 +891,8 @@ bch2_btree_interior_update_alloc(struct bch_fs *c)
static void btree_interior_update_free(struct closure *cl)
{
struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl);
struct btree_interior_update *as =
container_of(cl, struct btree_interior_update, cl);
mempool_free(as, &as->c->btree_interior_update_pool);
}
@ -910,9 +912,6 @@ static void btree_interior_update_nodes_reachable(struct closure *cl)
bch2_btree_node_free_ondisk(c, &as->pending[i]);
as->nr_pending = 0;
mutex_unlock(&c->btree_interior_update_lock);
mutex_lock(&c->btree_interior_update_lock);
list_del(&as->list);
mutex_unlock(&c->btree_interior_update_lock);
@ -1039,6 +1038,15 @@ static void btree_interior_update_updated_btree(struct bch_fs *c,
system_freezable_wq);
}
static void btree_interior_update_reparent(struct btree_interior_update *as,
struct btree_interior_update *child)
{
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
child->parent_as = as;
closure_get(&as->cl);
}
static void btree_interior_update_updated_root(struct bch_fs *c,
struct btree_interior_update *as,
enum btree_id btree_id)
@ -1053,14 +1061,8 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
* Old root might not be persistent yet - if so, redirect its
* btree_interior_update operation to point to us:
*/
if (r->as) {
BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT);
r->as->b = NULL;
r->as->mode = BTREE_INTERIOR_UPDATING_AS;
r->as->parent_as = as;
closure_get(&as->cl);
}
if (r->as)
btree_interior_update_reparent(as, r->as);
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
as->b = r->b;
@ -1068,8 +1070,6 @@ static void btree_interior_update_updated_root(struct bch_fs *c,
mutex_unlock(&c->btree_interior_update_lock);
bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
continue_at(&as->cl, btree_interior_update_nodes_written,
system_freezable_wq);
}
@ -1092,8 +1092,10 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
struct btree_interior_update *as,
struct btree *b)
{
struct closure *cl, *cl_n;
struct btree_interior_update *p, *n;
struct pending_btree_node_free *d;
struct btree_write *w;
struct bset_tree *t;
/*
@ -1107,41 +1109,8 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
for_each_bset(b, t)
as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
/*
* Does this node have unwritten data that has a pin on the journal?
*
* If so, transfer that pin to the btree_interior_update operation -
* note that if we're freeing multiple nodes, we only need to keep the
* oldest pin of any of the nodes we're freeing. We'll release the pin
* when the new nodes are persistent and reachable on disk:
*/
bch2_journal_pin_add_if_older(&c->journal,
&b->writes[0].journal,
&as->journal, interior_update_flush);
bch2_journal_pin_add_if_older(&c->journal,
&b->writes[1].journal,
&as->journal, interior_update_flush);
mutex_lock(&c->btree_interior_update_lock);
/*
* Does this node have any btree_interior_update operations preventing
* it from being written?
*
* If so, redirect them to point to this btree_interior_update: we can
* write out our new nodes, but we won't make them visible until those
* operations complete
*/
list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE);
p->mode = BTREE_INTERIOR_UPDATING_AS;
list_del(&p->write_blocked_list);
p->b = NULL;
p->parent_as = as;
closure_get(&as->cl);
}
/* Add this node to the list of nodes being freed: */
BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
@ -1152,6 +1121,38 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
d->level = b->level;
bkey_copy(&d->key, &b->key);
/*
* Does this node have any btree_interior_update operations preventing
* it from being written?
*
* If so, redirect them to point to this btree_interior_update: we can
* write out our new nodes, but we won't make them visible until those
* operations complete
*/
list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
list_del(&p->write_blocked_list);
btree_interior_update_reparent(as, p);
}
clear_btree_node_dirty(b);
w = btree_current_write(b);
llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
llist_add(&cl->list, &as->wait.list);
/*
* Does this node have unwritten data that has a pin on the journal?
*
* If so, transfer that pin to the btree_interior_update operation -
* note that if we're freeing multiple nodes, we only need to keep the
* oldest pin of any of the nodes we're freeing. We'll release the pin
* when the new nodes are persistent and reachable on disk:
*/
bch2_journal_pin_add_if_older(&c->journal, &w->journal,
&as->journal, interior_update_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
}
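The relocated comments above describe two hand-offs when an interior update frees a node: write-blocked interior updates are reparented onto this one, and the node's journal pin is transferred, where only the oldest pin among all the nodes being freed actually matters. A small standalone sketch of that "oldest pin wins" rule, with simplified types that are not the kernel's (journal pins reduced to bare sequence numbers):

#include <stdint.h>
#include <stdio.h>

struct freed_node {
        uint64_t journal_seq;   /* 0 == no unwritten data pinning the journal */
};

/* The interior update only needs one pin: the oldest seq of any freed node. */
static uint64_t oldest_pin(const struct freed_node *nodes, int nr)
{
        uint64_t oldest = UINT64_MAX;
        int i;

        for (i = 0; i < nr; i++)
                if (nodes[i].journal_seq && nodes[i].journal_seq < oldest)
                        oldest = nodes[i].journal_seq;

        return oldest == UINT64_MAX ? 0 : oldest;
}

int main(void)
{
        struct freed_node nodes[] = { { 42 }, { 0 }, { 17 } };

        printf("interior update pins journal at seq %llu\n",
               (unsigned long long) oldest_pin(nodes, 3));
        return 0;
}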

View File

@ -199,21 +199,6 @@ static inline u64 bch2_fs_sectors_used(struct bch_fs *c)
return min(c->capacity, __bch2_fs_sectors_used(c));
}
/* XXX: kill? */
static inline u64 sectors_available(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
u64 ret = 0;
rcu_read_lock();
for_each_member_device_rcu(ca, c, i)
ret += dev_buckets_available(ca) << ca->bucket_bits;
rcu_read_unlock();
return ret;
}
static inline bool is_available_bucket(struct bucket_mark mark)
{
return (!mark.owned_by_allocator &&

View File

@ -37,10 +37,10 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
path = strndup_user((const char __user *)
(unsigned long) dev, PATH_MAX);
if (!path)
return ERR_PTR(-ENOMEM);
if (IS_ERR(path))
return ERR_CAST(path);
bdev = lookup_bdev(strim(path));
bdev = lookup_bdev(path);
kfree(path);
if (IS_ERR(bdev))
return ERR_CAST(bdev);

View File

@ -1046,7 +1046,7 @@ static void bch2_read_endio(struct bio *bio)
if (rbio->ptr.cached &&
(((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
ptr_stale(rbio->ca, &rbio->ptr))) {
atomic_long_inc(&c->cache_read_races);
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
bch2_rbio_retry(c, rbio);

View File

@ -180,8 +180,10 @@ redo_peek:
ret == -EINTR)
goto redo_peek;
/* -EROFS or perhaps -ENOSPC - bail out: */
/* XXX warn here */
bch2_fs_fatal_error(c,
"error %i rewriting btree node with blacklisted journal seq",
ret);
bch2_journal_halt(j);
return;
}
}
@ -1018,6 +1020,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
fifo_for_each_entry_ptr(p, &j->pin, iter) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
}
@ -1147,6 +1150,7 @@ static void __journal_entry_new(struct journal *j, int count)
&fifo_peek_back(&j->pin));
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
}
@ -1516,7 +1520,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
j->replay_pin_list = NULL;
if (did_replay) {
bch2_btree_flush(c);
bch2_journal_flush_pins(&c->journal, U64_MAX);
/*
* Write a new journal entry _before_ we start journalling new data -
@ -1859,7 +1863,7 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
struct journal_entry_pin, list);
if (ret) {
/* must be list_del_init(), see bch2_journal_pin_drop() */
list_del_init(&ret->list);
list_move(&ret->list, &pin_list->flushed);
*seq = journal_pin_seq(j, pin_list);
break;
}
@ -1869,28 +1873,32 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
return ret;
}
static bool journal_has_pins(struct journal *j)
static bool journal_flush_done(struct journal *j, u64 seq_to_flush)
{
bool ret;
spin_lock(&j->lock);
journal_reclaim_fast(j);
ret = fifo_used(&j->pin) > 1 ||
atomic_read(&fifo_peek_front(&j->pin).count) > 1;
ret = (fifo_used(&j->pin) == 1 &&
atomic_read(&fifo_peek_front(&j->pin).count) == 1) ||
last_seq(j) > seq_to_flush;
spin_unlock(&j->lock);
return ret;
}
void bch2_journal_flush_pins(struct journal *j)
void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
struct journal_entry_pin *pin;
u64 seq;
u64 pin_seq;
while ((pin = journal_get_next_pin(j, U64_MAX, &seq)))
pin->flush(j, pin, seq);
while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq)))
pin->flush(j, pin, pin_seq);
wait_event(j->wait, !journal_has_pins(j) || bch2_journal_error(j));
wait_event(j->wait,
journal_flush_done(j, seq_to_flush) ||
bch2_journal_error(j));
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
@ -2174,9 +2182,18 @@ static void journal_write_done(struct closure *cl)
struct journal *j = container_of(cl, struct journal, io);
struct journal_buf *w = journal_prev_buf(j);
__bch2_time_stats_update(j->write_time, j->write_start_time);
j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
__bch2_time_stats_update(j->write_time, j->write_start_time);
/*
* Updating last_seq_ondisk may let journal_reclaim_work() discard more
* buckets:
*
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
BUG_ON(!j->reservations.prev_buf_unwritten);
atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
@ -2199,12 +2216,6 @@ static void journal_write_done(struct closure *cl)
closure_wake_up(&w->wait);
wake_up(&j->wait);
/*
* Updating last_seq_ondisk may let journal_reclaim_work() discard more
* buckets:
*/
mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@ -2345,8 +2356,12 @@ static void journal_write_work(struct work_struct *work)
struct journal *j = container_of(to_delayed_work(work),
struct journal, write_work);
spin_lock(&j->lock);
set_bit(JOURNAL_NEED_WRITE, &j->flags);
if (!journal_entry_is_open(j)) {
spin_unlock(&j->lock);
return;
}
set_bit(JOURNAL_NEED_WRITE, &j->flags);
if (journal_buf_switch(j, false) != JOURNAL_UNLOCKED)
spin_unlock(&j->lock);
}
@ -2505,6 +2520,8 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent
void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
{
struct journal_buf *buf;
spin_lock(&j->lock);
BUG_ON(seq > atomic64_read(&j->seq));
@ -2517,8 +2534,9 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
if (seq == atomic64_read(&j->seq)) {
bool set_need_write = false;
if (parent &&
!closure_wait(&journal_cur_buf(j)->wait, parent))
buf = journal_cur_buf(j);
if (parent && !closure_wait(&buf->wait, parent))
BUG();
if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
@ -2529,7 +2547,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
switch (journal_buf_switch(j, set_need_write)) {
case JOURNAL_ENTRY_ERROR:
if (parent)
closure_wake_up(&journal_cur_buf(j)->wait);
closure_wake_up(&buf->wait);
break;
case JOURNAL_ENTRY_CLOSED:
/*
@ -2545,7 +2563,9 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
} else if (parent &&
seq + 1 == atomic64_read(&j->seq) &&
j->reservations.prev_buf_unwritten) {
if (!closure_wait(&journal_prev_buf(j)->wait, parent))
buf = journal_prev_buf(j);
if (!closure_wait(&buf->wait, parent))
BUG();
smp_mb();
@ -2553,7 +2573,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
/* check if raced with write completion (or failure) */
if (!j->reservations.prev_buf_unwritten ||
bch2_journal_error(j))
closure_wake_up(&journal_prev_buf(j)->wait);
closure_wake_up(&buf->wait);
}
spin_unlock(&j->lock);
@ -2698,6 +2718,39 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
return ret;
}
ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
ssize_t ret = 0;
unsigned i;
spin_lock_irq(&j->pin_lock);
fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
"%llu: count %u\n",
journal_pin_seq(j, pin_list),
atomic_read(&pin_list->count));
list_for_each_entry(pin, &pin_list->list, list)
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
"\t%p %pf\n",
pin, pin->flush);
if (!list_empty(&pin_list->flushed))
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
"flushed:\n");
list_for_each_entry(pin, &pin_list->flushed, list)
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
"\t%p %pf\n",
pin, pin->flush);
}
spin_unlock_irq(&j->pin_lock);
return ret;
}
static bool bch2_journal_writing_to_device(struct bch_dev *ca)
{
struct journal *j = &ca->fs->journal;
@ -2725,12 +2778,11 @@ static bool bch2_journal_writing_to_device(struct bch_dev *ca)
int bch2_journal_move(struct bch_dev *ca)
{
u64 last_flushed_seq;
struct journal_device *ja = &ca->journal;
struct bch_fs *c = ca->fs;
struct journal *j = &c->journal;
struct journal *j = &ca->fs->journal;
u64 seq_to_flush = 0;
unsigned i;
int ret = 0; /* Success */
int ret;
if (bch2_journal_writing_to_device(ca)) {
/*
@ -2744,16 +2796,10 @@ int bch2_journal_move(struct bch_dev *ca)
BUG_ON(bch2_journal_writing_to_device(ca));
}
/*
* Flush all btree updates to backing store so that any
* journal entries written to ca become stale and are no
* longer needed.
*/
for (i = 0; i < ja->nr; i++)
seq_to_flush = max(seq_to_flush, ja->bucket_seq[i]);
/*
* XXX: switch to normal journal reclaim machinery
*/
bch2_btree_flush(c);
bch2_journal_flush_pins(j, seq_to_flush);
/*
* Force a meta-data journal entry to be written so that
@ -2767,12 +2813,9 @@ int bch2_journal_move(struct bch_dev *ca)
* the device
*/
spin_lock(&j->lock);
last_flushed_seq = last_seq(j);
ret = j->last_seq_ondisk > seq_to_flush ? 0 : -EIO;
spin_unlock(&j->lock);
for (i = 0; i < ja->nr; i += 1)
BUG_ON(ja->bucket_seq[i] > last_flushed_seq);
return ret;
}
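The rewritten bch2_journal_move() computes the highest journal sequence recorded against any of the device's journal buckets, flushes pins up to that sequence with bch2_journal_flush_pins(), and treats the move as failed unless last_seq_ondisk has advanced past it. A standalone sketch of that bookkeeping, using plain arrays instead of struct journal_device; names and values are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Highest journal seq any of the device's journal buckets still holds. */
static uint64_t seq_to_flush(const uint64_t *bucket_seq, unsigned nr)
{
        uint64_t seq = 0;
        unsigned i;

        for (i = 0; i < nr; i++)
                if (bucket_seq[i] > seq)
                        seq = bucket_seq[i];

        return seq;
}

int main(void)
{
        uint64_t bucket_seq[] = { 90, 123, 7 };
        uint64_t target = seq_to_flush(bucket_seq, 3);
        uint64_t last_seq_ondisk = 200;         /* read under j->lock in the kernel */

        printf("flush pins up to %llu: %s\n",
               (unsigned long long) target,
               last_seq_ondisk > target ? "ok" : "-EIO");
        return 0;
}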
@ -2786,7 +2829,7 @@ void bch2_fs_journal_stop(struct journal *j)
* journal entries, then force a brand new empty journal entry to be
* written:
*/
bch2_journal_flush_pins(j);
bch2_journal_flush_pins(j, U64_MAX);
bch2_journal_flush_async(j, NULL);
bch2_journal_meta(j);

View File

@ -141,7 +141,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_flush_pins(struct journal *);
void bch2_journal_flush_pins(struct journal *, u64);
struct closure;
struct bch_fs;
@ -354,6 +354,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
}
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
int bch2_dev_journal_alloc(struct bch_dev *);

View File

@ -38,6 +38,7 @@ struct journal_buf {
struct journal_entry_pin_list {
struct list_head list;
struct list_head flushed;
atomic_t count;
};

View File

@ -211,7 +211,14 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_gc_thread_stop(c);
bch2_btree_flush(c);
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
bch2_journal_flush_pins(&c->journal, U64_MAX);
if (!bch2_journal_error(&c->journal))
bch2_btree_verify_flushed(c);
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);

View File

@ -120,6 +120,7 @@ do { \
return strtoi_h(buf, &var) ?: (ssize_t) size; \
} while (0)
write_attribute(trigger_journal_flush);
write_attribute(trigger_btree_coalesce);
write_attribute(trigger_gc);
write_attribute(prune_cache);
@ -127,35 +128,25 @@ write_attribute(prune_cache);
read_attribute(uuid);
read_attribute(minor);
read_attribute(bucket_size);
read_attribute(bucket_size_bytes);
read_attribute(block_size);
read_attribute(block_size_bytes);
read_attribute(btree_node_size);
read_attribute(btree_node_size_bytes);
read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(tree_depth);
read_attribute(root_usage_percent);
read_attribute(read_priority_stats);
read_attribute(write_priority_stats);
read_attribute(fragmentation_stats);
read_attribute(oldest_gen_stats);
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(cache_available_percent);
read_attribute(compression_stats);
read_attribute(written);
read_attribute(btree_written);
read_attribute(metadata_written);
read_attribute(journal_debug);
write_attribute(journal_flush);
read_attribute(journal_pins);
read_attribute(internal_uuid);
read_attribute(btree_gc_running);
read_attribute(btree_nodes);
read_attribute(btree_used_percent);
read_attribute(average_key_size);
read_attribute(available_buckets);
read_attribute(free_buckets);
read_attribute(dirty_data);
@ -168,10 +159,9 @@ read_attribute(meta_buckets);
read_attribute(alloc_buckets);
read_attribute(has_data);
read_attribute(has_metadata);
read_attribute(bset_tree_stats);
read_attribute(alloc_debug);
read_attribute(cache_read_races);
read_attribute(read_realloc_races);
rw_attribute(journal_write_delay_ms);
rw_attribute(journal_reclaim_delay_ms);
@ -221,73 +211,6 @@ static struct attribute sysfs_state_rw = {
.mode = S_IRUGO
};
static int bch2_bset_print_stats(struct bch_fs *c, char *buf)
{
struct bset_stats stats;
size_t nodes = 0;
struct btree *b;
struct bucket_table *tbl;
struct rhash_head *pos;
unsigned iter;
memset(&stats, 0, sizeof(stats));
rcu_read_lock();
for_each_cached_btree(b, c, tbl, iter, pos) {
bch2_btree_keys_stats(b, &stats);
nodes++;
}
rcu_read_unlock();
return snprintf(buf, PAGE_SIZE,
"btree nodes: %zu\n"
"written sets: %zu\n"
"written key bytes: %zu\n"
"unwritten sets: %zu\n"
"unwritten key bytes: %zu\n"
"no table sets: %zu\n"
"no table key bytes: %zu\n"
"floats: %zu\n"
"failed unpacked: %zu\n"
"failed prev: %zu\n"
"failed overflow: %zu\n",
nodes,
stats.sets[BSET_RO_AUX_TREE].nr,
stats.sets[BSET_RO_AUX_TREE].bytes,
stats.sets[BSET_RW_AUX_TREE].nr,
stats.sets[BSET_RW_AUX_TREE].bytes,
stats.sets[BSET_NO_AUX_TREE].nr,
stats.sets[BSET_NO_AUX_TREE].bytes,
stats.floats,
stats.failed_unpacked,
stats.failed_prev,
stats.failed_overflow);
}
static unsigned bch2_root_usage(struct bch_fs *c)
{
unsigned bytes = 0;
struct bkey_packed *k;
struct btree *b;
struct btree_node_iter iter;
goto lock_root;
do {
six_unlock_read(&b->lock);
lock_root:
b = c->btree_roots[BTREE_ID_EXTENTS].b;
six_lock_read(&b->lock);
} while (b != c->btree_roots[BTREE_ID_EXTENTS].b);
for_each_btree_node_key(b, k, &iter, btree_node_is_extents(b))
bytes += bkey_bytes(k);
six_unlock_read(&b->lock);
return (bytes * 100) / btree_bytes(c);
}
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
size_t ret = 0;
@ -301,27 +224,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
return ret;
}
static unsigned bch2_fs_available_percent(struct bch_fs *c)
{
return div64_u64((u64) sectors_available(c) * 100,
c->capacity ?: 1);
}
#if 0
static unsigned bch2_btree_used(struct bch_fs *c)
{
return div64_u64(c->gc_stats.key_bytes * 100,
(c->gc_stats.nodes ?: 1) * btree_bytes(c));
}
static unsigned bch2_average_key_size(struct bch_fs *c)
{
return c->gc_stats.nkeys
? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
: 0;
}
#endif
static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
{
struct bch_fs_usage stats = bch2_fs_usage_read(c);
@ -358,6 +260,9 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
compressed_sectors_compressed = 0,
compressed_sectors_uncompressed = 0;
if (!bch2_fs_running(c))
return -EPERM;
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k)
if (k.k->type == BCH_EXTENT) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
@ -402,29 +307,17 @@ SHOW(bch2_fs)
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
sysfs_hprint(block_size, block_bytes(c));
sysfs_print(block_size_bytes, block_bytes(c));
sysfs_hprint(btree_node_size, c->sb.btree_node_size << 9);
sysfs_print(btree_node_size_bytes, c->sb.btree_node_size << 9);
sysfs_print(block_size, block_bytes(c));
sysfs_print(btree_node_size, btree_bytes(c));
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
sysfs_print(cache_available_percent, bch2_fs_available_percent(c));
sysfs_print(btree_gc_running, c->gc_pos.phase != GC_PHASE_DONE);
#if 0
/* XXX: reimplement */
sysfs_print(btree_used_percent, bch2_btree_used(c));
sysfs_print(btree_nodes, c->gc_stats.nodes);
sysfs_hprint(average_key_size, bch2_average_key_size(c));
#endif
sysfs_print(cache_read_races,
atomic_long_read(&c->cache_read_races));
sysfs_print(read_realloc_races,
atomic_long_read(&c->read_realloc_races));
sysfs_printf(foreground_write_ratelimit_enabled, "%i",
c->foreground_write_ratelimit_enabled);
@ -445,28 +338,21 @@ SHOW(bch2_fs)
/* Debugging: */
if (attr == &sysfs_journal_debug)
return bch2_journal_print_debug(&c->journal, buf);
#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
if (!bch2_fs_running(c))
return -EPERM;
if (attr == &sysfs_bset_tree_stats)
return bch2_bset_print_stats(c, buf);
if (attr == &sysfs_alloc_debug)
return show_fs_alloc_debug(c, buf);
sysfs_print(tree_depth, c->btree_roots[BTREE_ID_EXTENTS].b->level);
sysfs_print(root_usage_percent, bch2_root_usage(c));
if (attr == &sysfs_journal_debug)
return bch2_journal_print_debug(&c->journal, buf);
if (attr == &sysfs_journal_pins)
return bch2_journal_print_pins(&c->journal, buf);
if (attr == &sysfs_compression_stats)
return bch2_compression_stats(c, buf);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
return 0;
}
@ -519,17 +405,14 @@ STORE(__bch2_fs)
if (!bch2_fs_running(c))
return -EPERM;
if (attr == &sysfs_journal_flush) {
bch2_journal_meta_async(&c->journal, NULL);
/* Debugging: */
return size;
}
if (attr == &sysfs_trigger_journal_flush)
bch2_journal_meta_async(&c->journal, NULL);
if (attr == &sysfs_trigger_btree_coalesce)
bch2_coalesce(c);
/* Debugging: */
if (attr == &sysfs_trigger_gc)
bch2_gc(c);
@ -557,28 +440,21 @@ STORE(bch2_fs)
SYSFS_OPS(bch2_fs);
struct attribute *bch2_fs_files[] = {
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
&sysfs_minor,
&sysfs_block_size,
&sysfs_block_size_bytes,
&sysfs_btree_node_size,
&sysfs_btree_node_size_bytes,
&sysfs_tree_depth,
&sysfs_root_usage_percent,
&sysfs_btree_cache_size,
&sysfs_cache_available_percent,
&sysfs_compression_stats,
&sysfs_average_key_size,
&sysfs_meta_replicas_have,
&sysfs_data_replicas_have,
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
&sysfs_foreground_target_percent,
&sysfs_tiering_percent,
&sysfs_journal_flush,
&sysfs_compression_stats,
NULL
};
@ -598,21 +474,17 @@ STORE(bch2_fs_internal)
SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_debug,
&sysfs_alloc_debug,
&sysfs_journal_debug,
&sysfs_journal_pins,
&sysfs_btree_gc_running,
&sysfs_btree_nodes,
&sysfs_btree_used_percent,
&sysfs_bset_tree_stats,
&sysfs_cache_read_races,
&sysfs_read_realloc_races,
&sysfs_trigger_journal_flush,
&sysfs_trigger_btree_coalesce,
&sysfs_trigger_gc,
&sysfs_prune_cache,
&sysfs_foreground_write_ratelimit_enabled,
&sysfs_copy_gc_enabled,
&sysfs_tiering_enabled,
@ -853,10 +725,8 @@ SHOW(bch2_dev)
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
sysfs_hprint(bucket_size, bucket_bytes(ca));
sysfs_print(bucket_size_bytes, bucket_bytes(ca));
sysfs_hprint(block_size, block_bytes(c));
sysfs_print(block_size_bytes, block_bytes(c));
sysfs_print(bucket_size, bucket_bytes(ca));
sysfs_print(block_size, block_bytes(c));
sysfs_print(first_bucket, ca->mi.first_bucket);
sysfs_print(nbuckets, ca->mi.nbuckets);
sysfs_print(discard, ca->mi.discard);
@ -979,35 +849,46 @@ SYSFS_OPS(bch2_dev);
struct attribute *bch2_dev_files[] = {
&sysfs_uuid,
&sysfs_bucket_size,
&sysfs_bucket_size_bytes,
&sysfs_block_size,
&sysfs_block_size_bytes,
&sysfs_first_bucket,
&sysfs_nbuckets,
/* settings: */
&sysfs_discard,
&sysfs_cache_replacement_policy,
&sysfs_tier,
&sysfs_state_rw,
&sysfs_has_data,
&sysfs_has_metadata,
/* io stats: */
&sysfs_written,
&sysfs_btree_written,
&sysfs_metadata_written,
/* alloc info - data: */
&sysfs_dirty_data,
&sysfs_dirty_bytes,
&sysfs_cached_data,
&sysfs_cached_bytes,
/* alloc info - buckets: */
&sysfs_available_buckets,
&sysfs_free_buckets,
&sysfs_dirty_buckets,
&sysfs_cached_buckets,
&sysfs_meta_buckets,
&sysfs_alloc_buckets,
/* alloc info - other stats: */
&sysfs_read_priority_stats,
&sysfs_write_priority_stats,
&sysfs_fragmentation_stats,
&sysfs_oldest_gen_stats,
&sysfs_reserve_stats,
&sysfs_available_buckets,
&sysfs_free_buckets,
&sysfs_dirty_data,
&sysfs_dirty_bytes,
&sysfs_dirty_buckets,
&sysfs_cached_data,
&sysfs_cached_bytes,
&sysfs_cached_buckets,
&sysfs_meta_buckets,
&sysfs_alloc_buckets,
&sysfs_has_data,
&sysfs_has_metadata,
&sysfs_discard,
&sysfs_written,
&sysfs_btree_written,
&sysfs_metadata_written,
&sysfs_cache_replacement_policy,
&sysfs_tier,
&sysfs_state_rw,
/* debug: */
&sysfs_alloc_debug,
sysfs_pd_controller_files(copy_gc),