Update bcachefs sources to ed4aea2ad4 bcachefs: fix gcc warning

Kent Overstreet 2018-05-04 14:04:31 -04:00
parent c598d91dcb
commit 018de5aa89
37 changed files with 4216 additions and 3299 deletions


@ -1 +1 @@
edf5f38218f699e53913a549465f35d36c4418f7
ed4aea2ad4fa1b3891684cbd071d1a1ae9094342


@ -69,6 +69,7 @@
#include "extents.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "super-io.h"
#include <linux/blkdev.h>


@ -271,17 +271,19 @@ do { \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
/* name, frequency_units, duration_units */
#define BCH_TIME_STATS() \
BCH_TIME_STAT(btree_node_mem_alloc, sec, us) \
BCH_TIME_STAT(btree_gc, sec, ms) \
BCH_TIME_STAT(btree_split, sec, us) \
BCH_TIME_STAT(btree_sort, ms, us) \
BCH_TIME_STAT(btree_read, ms, us) \
BCH_TIME_STAT(journal_write, us, us) \
BCH_TIME_STAT(journal_delay, ms, us) \
BCH_TIME_STAT(journal_blocked, sec, ms) \
BCH_TIME_STAT(journal_flush_seq, us, us)
BCH_TIME_STAT(btree_node_mem_alloc) \
BCH_TIME_STAT(btree_gc) \
BCH_TIME_STAT(btree_split) \
BCH_TIME_STAT(btree_sort) \
BCH_TIME_STAT(btree_read) \
BCH_TIME_STAT(data_write) \
BCH_TIME_STAT(data_read) \
BCH_TIME_STAT(data_promote) \
BCH_TIME_STAT(journal_write) \
BCH_TIME_STAT(journal_delay) \
BCH_TIME_STAT(journal_blocked) \
BCH_TIME_STAT(journal_flush_seq)
#include "alloc_types.h"
#include "buckets_types.h"
@ -416,7 +418,12 @@ struct bch_dev {
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
atomic_t latency[2];
atomic64_t cur_latency[2];
struct time_stats io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
u64 congested_last;
struct io_count __percpu *io_done;
};
@ -644,6 +651,7 @@ struct bch_fs {
struct bio_set bio_write;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
struct rhashtable promote_table;
mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_NR];
@ -708,12 +716,13 @@ struct bch_fs {
unsigned copy_gc_enabled:1;
unsigned rebalance_enabled:1;
unsigned rebalance_percent;
bool promote_whole_extents;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
#define BCH_TIME_STAT(name) \
struct time_stats name##_time;
BCH_TIME_STATS()
#undef BCH_TIME_STAT
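Aside: BCH_TIME_STATS()/BCH_TIME_STAT() is an x-macro list. A minimal standalone sketch of how the new single-argument form expands (the names and the stub time_stats here are illustrative, not the bcachefs definitions):

#include <stdint.h>

struct time_stats { uint64_t count, total_ns; };        /* stub for the sketch */

#define EXAMPLE_TIME_STATS()                    \
        EXAMPLE_TIME_STAT(node_alloc)           \
        EXAMPLE_TIME_STAT(journal_write)

struct example_stats {
#define EXAMPLE_TIME_STAT(name) struct time_stats name##_time;
        EXAMPLE_TIME_STATS()
#undef EXAMPLE_TIME_STAT
};

/* expands to:
 *         struct time_stats node_alloc_time;
 *         struct time_stats journal_write_time;
 */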


@ -1088,13 +1088,14 @@ LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
struct bch_sb, flags[1], 28, 32);
LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
struct bch_sb, flags[2], 0, 4);
/* Features: */
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
@ -1193,29 +1194,41 @@ struct jset_entry {
};
};
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
#define BCH_JSET_ENTRY_TYPES() \
x(btree_keys, 0) \
x(btree_root, 1) \
x(prio_ptrs, 2) \
x(blacklist, 3) \
x(blacklist_v2, 4)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
BCH_JSET_ENTRY_TYPES()
#undef x
BCH_JSET_ENTRY_NR
};
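For reference, the x() expansion above is mechanical; the preprocessed enum comes out as:

enum {
        BCH_JSET_ENTRY_btree_keys       = 0,
        BCH_JSET_ENTRY_btree_root       = 1,
        BCH_JSET_ENTRY_prio_ptrs        = 2,
        BCH_JSET_ENTRY_blacklist        = 3,
        BCH_JSET_ENTRY_blacklist_v2     = 4,
        BCH_JSET_ENTRY_NR
};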
/*
* Journal sequence numbers can be blacklisted: bsets record the max sequence
* number of all the journal entries they contain updates for, so that on
recovery we can ignore those bsets that contain index updates newer than what
* made it into the journal.
*
* This means that we can't reuse that journal_seq - we have to skip it, and
* then record that we skipped it so that the next time we crash and recover we
* don't think there was a missing journal entry.
*/
struct jset_entry_blacklist {
struct jset_entry entry;
__le64 seq;
};
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
enum {
JOURNAL_ENTRY_BTREE_KEYS = 0,
JOURNAL_ENTRY_BTREE_ROOT = 1,
JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */
/*
* Journal sequence numbers can be blacklisted: bsets record the max
* sequence number of all the journal entries they contain updates for,
* so that on recovery we can ignore those bsets that contain index
updates newer than what made it into the journal.
*
* This means that we can't reuse that journal_seq - we have to skip it,
* and then record that we skipped it so that the next time we crash and
* recover we don't think there was a missing journal entry.
*/
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
struct jset_entry_blacklist_v2 {
struct jset_entry entry;
__le64 start;
__le64 end;
};
/*


@ -13,7 +13,8 @@
#include "error.h"
#include "extents.h"
#include "io.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
#include <trace/events/bcachefs.h>
@ -947,6 +948,7 @@ enum btree_validate_ret {
#define btree_err(type, c, b, i, msg, ...) \
({ \
__label__ out; \
char _buf[300], *out = _buf, *end = out + sizeof(_buf); \
\
out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
@ -956,7 +958,11 @@ enum btree_validate_ret {
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
mustfix_fsck_err(c, "%s", _buf); \
} else { \
goto out; \
} \
\
switch (write) { \
case READ: \
bch_err(c, "%s", _buf); \
\
switch (type) { \
@ -976,7 +982,17 @@ enum btree_validate_ret {
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
break; \
case WRITE: \
bch_err(c, "corrupt metadata before write: %s", _buf); \
\
if (bch2_fs_inconsistent(c)) { \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
break; \
} \
out: \
true; \
})
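The btree_err() macro above leans on two GNU C extensions, statement expressions and local labels; a minimal sketch of that shape in isolation (the macro name and body here are made up, only the idiom matches):

#define check_nonneg(x)                                         \
({                                                              \
        __label__ out;                                          \
        int _ret = 1;                                           \
        if ((x) >= 0)                                           \
                goto out;  /* early exit, like 'goto out' above */ \
        _ret = 0;                                               \
out:                                                            \
        _ret;   /* value of the whole ({ ... }) expression */   \
})

/* usage: if (!check_nonneg(v)) handle_bad_value(); */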
@ -1323,37 +1339,48 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_devs_mask avoid;
bool can_retry;
memset(&avoid, 0, sizeof(avoid));
goto start;
do {
while (1) {
bch_info(c, "retrying read");
ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio);
bio_set_dev(bio, rb->pick.ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
start:
bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
__set_bit(rb->pick.ca->dev_idx, avoid.d);
rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
submit_bio_wait(bio);
} else {
bio->bi_status = BLK_STS_REMOVED;
}
start:
bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
__set_bit(rb->pick.ptr.dev, avoid.d);
can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
if (!bio->bi_status &&
!bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
goto out;
} while (!IS_ERR_OR_NULL(rb->pick.ca));
!bch2_btree_node_read_done(c, b, can_retry))
break;
if (!can_retry) {
set_btree_node_read_error(b);
out:
if (!IS_ERR_OR_NULL(rb->pick.ca))
percpu_ref_put(&rb->pick.ca->io_ref);
break;
}
}
bch2_time_stats_update(&c->btree_read_time, rb->start_time);
bio_put(&rb->bio);
@ -1365,10 +1392,13 @@ static void btree_node_read_endio(struct bio *bio)
{
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
struct bch_fs *c = rb->c;
bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
if (rb->have_ioref) {
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}
INIT_WORK(&rb->work, btree_node_read_work);
queue_work(system_unbound_wq, &rb->work);
}
@ -1377,42 +1407,59 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
{
struct extent_pick_ptr pick;
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
int ret;
trace_btree_read(c, b);
pick = bch2_btree_pick_ptr(c, b, NULL);
if (bch2_fs_fatal_err_on(!pick.ca, c,
ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
if (bch2_fs_fatal_err_on(ret <= 0, c,
"btree node read error: no device to read from")) {
set_btree_node_read_error(b);
return;
}
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->start_time = local_clock();
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
bio_set_dev(bio, pick.ca->disk_sb.bdev);
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = b;
bch2_bio_map(bio, b->data);
this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
bio_sectors(bio));
set_btree_node_read_in_flight(b);
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
bio_sectors(bio));
bio_set_dev(bio, ca->disk_sb.bdev);
if (sync) {
submit_bio_wait(bio);
bio->bi_private = b;
btree_node_read_work(&rb->work);
} else {
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = b;
submit_bio(bio);
}
} else {
bio->bi_status = BLK_STS_REMOVED;
if (sync)
btree_node_read_work(&rb->work);
else
queue_work(system_unbound_wq, &rb->work);
}
}
int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
@ -1593,20 +1640,21 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
unsigned long flags;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
if (bio->bi_status == BLK_STS_REMOVED ||
bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
if (wbio->have_io_ref)
if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
if (parent) {


@ -12,8 +12,8 @@ struct btree_iter;
struct btree_read_bio {
struct bch_fs *c;
unsigned submit_time_us;
u64 start_time;
unsigned have_ioref:1;
struct extent_pick_ptr pick;
struct work_struct work;
struct bio bio;


@ -748,7 +748,9 @@ static void btree_iter_prefetch(struct btree_iter *iter)
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
unsigned nr = iter->level > 1 ? 1 : 8;
unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags)
? (iter->level > 1 ? 0 : 2)
: (iter->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(iter, iter->level);
while (nr) {


@ -12,6 +12,7 @@
#include "buckets.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "replicas.h"
#include "super-io.h"


@ -8,6 +8,7 @@
#include "debug.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include <linux/sort.h>
@ -137,7 +138,7 @@ void bch2_btree_journal_key(struct btree_insert *trans,
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
if (likely(trans->journal_res.ref)) {
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
@ -155,12 +156,16 @@ void bch2_btree_journal_key(struct btree_insert *trans,
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
}
if (unlikely(!journal_pin_active(&w->journal)))
bch2_journal_pin_add(j, &trans->journal_res,
&w->journal,
if (unlikely(!journal_pin_active(&w->journal))) {
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq;
bch2_journal_pin_add(j, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
}
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);


@ -142,7 +142,8 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow\n"))
"buckets_unavailable overflow (%llu > %llu)\n",
stats.buckets_unavailable, total))
return 0;
return total - stats.buckets_unavailable;


@ -36,6 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
struct bset *sorted, *inmemory;
struct extent_pick_ptr pick;
struct bch_dev *ca;
struct bio *bio;
if (c->opts.nochanges)
@ -54,12 +55,15 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
v->btree_id = b->btree_id;
bch2_btree_keys_init(v, &c->expensive_debug_checks);
pick = bch2_btree_pick_ptr(c, b, NULL);
if (IS_ERR_OR_NULL(pick.ca))
if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
return;
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
if (!bch2_dev_get_ioref(ca, READ))
return;
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
bio_set_dev(bio, pick.ca->disk_sb.bdev);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
@ -68,7 +72,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&pick.ca->io_ref);
percpu_ref_put(&ca->io_ref);
memcpy(n_ondisk, n_sorted, btree_bytes(c));


@ -3,20 +3,22 @@
#include "io.h"
#include "super.h"
void bch2_inconsistent_error(struct bch_fs *c)
bool bch2_inconsistent_error(struct bch_fs *c)
{
set_bit(BCH_FS_ERROR, &c->flags);
switch (c->opts.errors) {
case BCH_ON_ERROR_CONTINUE:
break;
return false;
case BCH_ON_ERROR_RO:
if (bch2_fs_emergency_read_only(c))
bch_err(c, "emergency read only");
break;
return true;
case BCH_ON_ERROR_PANIC:
panic(bch2_fmt(c, "panic after error"));
break;
return true;
default:
BUG();
}
}


@ -45,13 +45,13 @@ do { \
* BCH_ON_ERROR_CONTINUE mode
*/
void bch2_inconsistent_error(struct bch_fs *);
bool bch2_inconsistent_error(struct bch_fs *);
#define bch2_fs_inconsistent(c, ...) \
do { \
({ \
bch_err(c, __VA_ARGS__); \
bch2_inconsistent_error(c); \
} while (0)
})
#define bch2_fs_inconsistent_on(cond, c, ...) \
({ \


@ -588,58 +588,51 @@ out:
return out - buf;
}
static inline bool dev_latency_better(struct bch_dev *dev1,
struct bch_dev *dev2)
static inline bool dev_latency_better(struct bch_fs *c,
const struct bch_extent_ptr *ptr1,
const struct bch_extent_ptr *ptr2)
{
unsigned l1 = atomic_read(&dev1->latency[READ]);
unsigned l2 = atomic_read(&dev2->latency[READ]);
struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev);
struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev);
u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
}
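The one-liner above prefers a device with probability proportional to the other device's latency, so faster devices win more often without starving slower ones. A standalone sketch of the same weighting (rand_range() is a hypothetical stand-in for bch2_rand_range()):

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* uniform in [0, max); ignores modulo bias, assumes max > 0 */
static uint64_t rand_range(uint64_t max)
{
        return (uint64_t) rand() % max;
}

/* returns true (prefer device 1) with probability ~ l2 / (l1 + l2) */
static bool prefer_dev1(uint64_t l1, uint64_t l2)
{
        return rand_range(l1 + l2) > l1;
}

/* e.g. l1 = 1ms, l2 = 3ms: device 1 is preferred roughly 75% of the time */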
static void extent_pick_read_device(struct bch_fs *c,
static int extent_pick_read_device(struct bch_fs *c,
struct bkey_s_c_extent e,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
struct bch_dev *ca;
int ret = 0;
extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr))
continue;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
if (avoid && test_bit(ptr->dev, avoid->d))
continue;
if (avoid) {
if (test_bit(ca->dev_idx, avoid->d))
if (ret && !dev_latency_better(c, ptr, &pick->ptr))
continue;
if (pick->ca &&
test_bit(pick->ca->dev_idx, avoid->d))
goto use;
}
if (pick->ca && !dev_latency_better(ca, pick->ca))
continue;
use:
if (!percpu_ref_tryget(&ca->io_ref))
continue;
if (pick->ca)
percpu_ref_put(&pick->ca->io_ref);
*pick = (struct extent_pick_ptr) {
.ptr = *ptr,
.crc = crc,
.ca = ca,
};
ret = 1;
}
return ret;
}
/* Btree ptrs */
@ -759,16 +752,12 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
#undef p
}
struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
struct bch_devs_mask *avoid)
int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
struct extent_pick_ptr pick = { .ca = NULL };
extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
avoid, &pick);
return pick;
return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
avoid, pick);
}
/* Extents */
@ -2057,37 +2046,33 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
* Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
* other devices, it will still pick a pointer from avoid.
*/
void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *ret)
struct extent_pick_ptr *pick)
{
struct bkey_s_c_extent e;
int ret;
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
case KEY_TYPE_COOKIE:
ret->ca = NULL;
return;
return 0;
case KEY_TYPE_ERROR:
ret->ca = ERR_PTR(-EIO);
return;
return -EIO;
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
e = bkey_s_c_to_extent(k);
ret->ca = NULL;
ret = extent_pick_read_device(c, bkey_s_c_to_extent(k),
avoid, pick);
extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
if (!ret && !bkey_extent_is_cached(k.k))
ret = -EIO;
if (!ret->ca && !bkey_extent_is_cached(e.k))
ret->ca = ERR_PTR(-EIO);
return;
return ret;
case BCH_RESERVATION:
ret->ca = NULL;
return;
return 0;
default:
BUG();
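The picker now follows a three-way return convention: negative errno on hard error, 0 when there is nothing to read from, positive when *pick was filled in. A minimal sketch of the caller shape this implies (the stub and messages are hypothetical; only the convention is from the diff):

#include <stdio.h>

/* stand-in for the picker: pretend it found a pointer */
static int pick_ptr_stub(void)
{
        return 1;       /* would be -EIO on error, 0 for a hole */
}

int main(void)
{
        int ret = pick_ptr_stub();

        if (ret < 0)
                printf("hard error %d: fail the read\n", ret);
        else if (!ret)
                printf("no pointer: zero-fill, treat as a hole\n");
        else
                printf("got a pick: submit the read\n");
        return 0;
}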


@ -53,11 +53,11 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct btree *,
struct btree_node_iter_large *);
struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
struct bch_devs_mask *avoid);
int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *);
void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
struct bch_devs_mask *,
struct extent_pick_ptr *);


@ -21,7 +21,6 @@ struct bch_extent_crc_unpacked {
struct extent_pick_ptr {
struct bch_extent_ptr ptr;
struct bch_extent_crc_unpacked crc;
struct bch_dev *ca;
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */


@ -20,6 +20,7 @@
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>
#include <linux/writeback.h>
@ -124,13 +125,13 @@ static void bch2_quota_reservation_put(struct bch_fs *c,
if (!res->sectors)
return;
mutex_lock(&inode->ei_update_lock);
mutex_lock(&inode->ei_quota_lock);
BUG_ON(res->sectors > inode->ei_quota_reserved);
bch2_quota_acct(c, inode->ei_qid, Q_SPC,
-((s64) res->sectors), BCH_QUOTA_PREALLOC);
inode->ei_quota_reserved -= res->sectors;
mutex_unlock(&inode->ei_update_lock);
mutex_unlock(&inode->ei_quota_lock);
res->sectors = 0;
}
@ -143,14 +144,14 @@ static int bch2_quota_reservation_add(struct bch_fs *c,
{
int ret;
mutex_lock(&inode->ei_update_lock);
mutex_lock(&inode->ei_quota_lock);
ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
check_enospc ? BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK);
if (likely(!ret)) {
inode->ei_quota_reserved += sectors;
res->sectors += sectors;
}
mutex_unlock(&inode->ei_update_lock);
mutex_unlock(&inode->ei_quota_lock);
return ret;
}
@ -195,9 +196,10 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
return __bch2_write_inode(c, inode, inode_set_size, &new_size);
}
static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
struct quota_res *quota_res, int sectors)
{
mutex_lock(&inode->ei_quota_lock);
#ifdef CONFIG_BCACHEFS_QUOTA
if (quota_res && sectors > 0) {
BUG_ON(sectors > quota_res->sectors);
@ -210,14 +212,7 @@ static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
}
#endif
inode->v.i_blocks += sectors;
}
static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
struct quota_res *quota_res, int sectors)
{
mutex_lock(&inode->ei_update_lock);
__i_sectors_acct(c, inode, quota_res, sectors);
mutex_unlock(&inode->ei_update_lock);
mutex_unlock(&inode->ei_quota_lock);
}
/* i_sectors accounting: */
@ -265,7 +260,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
if (h->new_i_size != U64_MAX)
i_size_write(&h->inode->v, h->new_i_size);
__i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
mutex_unlock(&h->inode->ei_update_lock);
@ -773,6 +768,7 @@ void bch2_invalidatepage(struct page *page, unsigned int offset,
int bch2_releasepage(struct page *page, gfp_t gfp_mask)
{
/* XXX: this can't take locks that are held while we allocate memory */
EBUG_ON(!PageLocked(page));
EBUG_ON(PageWriteback(page));
@ -881,10 +877,12 @@ static int readpage_add_page(struct readpages_iter *iter, struct page *page)
int ret;
prefetchw(&page->flags);
page_state_init_for_read(page);
ret = add_to_page_cache_lru(page, iter->mapping,
page->index, GFP_NOFS);
if (!ret)
page_state_init_for_read(page);
put_page(page);
return ret;
}
@ -992,12 +990,13 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
rbio->c = c;
rbio->start_time = local_clock();
while (1) {
struct extent_pick_ptr pick;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
unsigned bytes;
bool is_last;
bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
@ -1016,45 +1015,37 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_unlock(iter);
k = bkey_i_to_s_c(&tmp.k);
bch2_extent_pick_ptr(c, k, NULL, &pick);
if (IS_ERR(pick.ca)) {
bcache_io_error(c, bio, "no device to read from");
bio_endio(bio);
return;
if (readpages_iter) {
bool want_full_extent = false;
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc)
want_full_extent |= !!crc.csum_type |
!!crc.compression_type;
}
if (readpages_iter)
readpage_bio_extend(readpages_iter,
bio, k.k->p.offset,
pick.ca &&
(pick.crc.csum_type ||
pick.crc.compression_type));
want_full_extent);
}
bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
bio->bi_iter.bi_sector) << 9;
is_last = bytes == bio->bi_iter.bi_size;
swap(bio->bi_iter.bi_size, bytes);
if (bytes == bio->bi_iter.bi_size)
flags |= BCH_READ_LAST_FRAGMENT;
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(bio, k);
if (pick.ca) {
if (!is_last) {
bio_inc_remaining(&rbio->bio);
flags |= BCH_READ_MUST_CLONE;
trace_read_split(&rbio->bio);
}
bch2_read_extent(c, rbio, k, flags);
bch2_read_extent(c, rbio, bkey_s_c_to_extent(k),
&pick, flags);
} else {
zero_fill_bio(bio);
if (is_last)
bio_endio(bio);
}
if (is_last)
if (flags & BCH_READ_LAST_FRAGMENT)
return;
swap(bio->bi_iter.bi_size, bytes);
@ -1487,6 +1478,194 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
return copied;
}
#define WRITE_BATCH_PAGES 32
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
loff_t pos, unsigned len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct page *pages[WRITE_BATCH_PAGES];
unsigned long index = pos >> PAGE_SHIFT;
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
unsigned i, copied = 0, nr_pages_copied = 0;
int ret = 0;
BUG_ON(!len);
BUG_ON(nr_pages > ARRAY_SIZE(pages));
for (i = 0; i < nr_pages; i++) {
pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
if (!pages[i]) {
nr_pages = i;
ret = -ENOMEM;
goto out;
}
}
if (offset && !PageUptodate(pages[0])) {
ret = bch2_read_single_page(pages[0], mapping);
if (ret)
goto out;
}
if ((pos + len) & (PAGE_SIZE - 1) &&
!PageUptodate(pages[nr_pages - 1])) {
if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
} else {
ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
if (ret)
goto out;
}
}
for (i = 0; i < nr_pages; i++) {
ret = bch2_get_page_reservation(c, inode, pages[i], true);
if (ret && !PageUptodate(pages[i])) {
ret = bch2_read_single_page(pages[i], mapping);
if (ret)
goto out;
ret = bch2_get_page_reservation(c, inode, pages[i], true);
}
if (ret)
goto out;
}
if (mapping_writably_mapped(mapping))
for (i = 0; i < nr_pages; i++)
flush_dcache_page(pages[i]);
while (copied < len) {
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
unsigned pg_bytes = min_t(unsigned, len - copied,
PAGE_SIZE - pg_offset);
unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
iter, pg_offset, pg_bytes);
if (!pg_copied)
break;
flush_dcache_page(page);
iov_iter_advance(iter, pg_copied);
copied += pg_copied;
}
if (!copied)
goto out;
nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
inode->ei_last_dirtied = (unsigned long) current;
if (pos + copied > inode->v.i_size)
i_size_write(&inode->v, pos + copied);
if (copied < len &&
((offset + copied) & (PAGE_SIZE - 1))) {
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
if (!PageUptodate(page)) {
zero_user(page, 0, PAGE_SIZE);
copied -= (offset + copied) & (PAGE_SIZE - 1);
}
}
out:
for (i = 0; i < nr_pages_copied; i++) {
if (!PageUptodate(pages[i]))
SetPageUptodate(pages[i]);
if (!PageDirty(pages[i]))
set_page_dirty(pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
}
for (i = nr_pages_copied; i < nr_pages; i++) {
if (!PageDirty(pages[i]))
bch2_put_page_reservation(c, inode, pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
}
return copied ?: ret;
}
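The index/offset/nr_pages arithmetic at the top of __bch2_buffered_write() is easiest to see with numbers; a standalone worked example assuming 4KiB pages (the values are illustrative):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long pos = 6000, len = 9000;
        unsigned long index    = pos >> PAGE_SHIFT;                     /* 1 */
        unsigned long offset   = pos & (PAGE_SIZE - 1);                 /* 1904 */
        unsigned long nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); /* 3 */

        /* the write touches pages 1..3: a partial first and last page */
        printf("%lu %lu %lu\n", index, offset, nr_pages);
        return 0;
}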
static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
loff_t pos = iocb->ki_pos;
ssize_t written = 0;
int ret = 0;
pagecache_add_get(&mapping->add_lock);
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
PAGE_SIZE * WRITE_BATCH_PAGES - offset);
again:
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* Not only is this an optimisation, but it is also required
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
bytes = min_t(unsigned long, iov_iter_count(iter),
PAGE_SIZE - offset);
if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
ret = -EFAULT;
break;
}
}
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
if (unlikely(ret < 0))
break;
cond_resched();
if (unlikely(ret == 0)) {
/*
* If we were unable to copy any data at all, we must
* fall back to a single segment length write.
*
* If we didn't fallback here, we could livelock
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_single_seg_count(iter));
goto again;
}
pos += ret;
written += ret;
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(iter));
pagecache_add_put(&mapping->add_lock);
return written ? written : ret;
}
/* O_DIRECT reads */
static void bch2_dio_read_complete(struct closure *cl)
@ -1822,7 +2001,7 @@ static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = iocb->ki_flags & IOCB_DIRECT
? bch2_direct_write(iocb, from)
: generic_perform_write(file, from, iocb->ki_pos);
: bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;


@ -1028,6 +1028,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
mutex_init(&inode->ei_quota_lock);
inode->ei_journal_seq = 0;
return &inode->v;


@ -15,6 +15,8 @@ struct bch_inode_info {
u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
struct bch_hash_info ei_str_hash;

File diff suppressed because it is too large


@ -16,7 +16,7 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_latency_acct(struct bch_dev *, u64, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
@ -99,40 +99,28 @@ struct cache_promote_op;
struct extent_pick_ptr;
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
struct bkey_s_c_extent e, struct extent_pick_ptr *,
unsigned);
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
u64, struct bch_devs_mask *, unsigned);
struct bkey_s_c, struct bch_devs_mask *, unsigned);
void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
BCH_READ_NODECODE = 1 << 3,
BCH_READ_LAST_FRAGMENT = 1 << 4,
/* internal: */
BCH_READ_MUST_BOUNCE = 1 << 4,
BCH_READ_MUST_CLONE = 1 << 5,
BCH_READ_IN_RETRY = 1 << 6,
BCH_READ_MUST_BOUNCE = 1 << 5,
BCH_READ_MUST_CLONE = 1 << 6,
BCH_READ_IN_RETRY = 1 << 7,
};
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
struct bkey_s_c_extent e,
struct extent_pick_ptr *pick,
struct bkey_s_c k,
unsigned flags)
{
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode)
{
BUG_ON(rbio->_state);
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
}
static inline struct bch_read_bio *rbio_init(struct bio *bio,
@ -146,4 +134,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
return rbio;
}
void bch2_fs_io_exit(struct bch_fs *);
int bch2_fs_io_init(struct bch_fs *);
#endif /* _BCACHEFS_IO_H */


@ -14,6 +14,8 @@
struct bch_read_bio {
struct bch_fs *c;
u64 start_time;
u64 submit_time;
/*
* Reads will often have to be split, and if the extent being read from
@ -35,17 +37,19 @@ struct bch_read_bio {
*/
struct bvec_iter bvec_iter;
unsigned submit_time_us;
u8 flags;
u16 flags;
union {
struct {
u8 bounce:1,
u16 bounce:1,
split:1,
kmalloc:1,
have_ioref:1,
narrow_crcs:1,
hole:1,
retry:2,
context:2;
};
u8 _state;
u16 _state;
};
struct bch_devs_list devs_have;
@ -66,20 +70,20 @@ struct bch_read_bio {
struct bch_write_bio {
struct bch_fs *c;
struct bch_dev *ca;
struct bch_write_bio *parent;
u64 submit_time;
struct bch_devs_list failed;
u8 order;
u8 dev;
unsigned split:1,
bounce:1,
put_bio:1,
have_io_ref:1,
have_ioref:1,
used_mempool:1;
unsigned submit_time_us;
struct bio bio;
};
@ -87,6 +91,7 @@ struct bch_write_op {
struct closure cl;
struct bch_fs *c;
struct workqueue_struct *io_wq;
u64 start_time;
unsigned written; /* sectors */
u16 flags;

File diff suppressed because it is too large


@ -112,72 +112,37 @@
#include "journal_types.h"
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
/* must be last: */
struct jset j;
};
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
while (entry < vstruct_last(jset)) {
if (entry->type == type)
return entry;
entry = vstruct_next(entry);
}
return NULL;
}
#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
vstruct_for_each_safe(entry, k, _n)
#define JOURNAL_PIN (32 * 1024)
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
return pin->pin_list != NULL;
}
static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
return &j->pin.data[seq & j->pin.mask];
}
u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add(struct journal *, struct journal_res *,
struct journal_entry_pin *, journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
int bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
struct closure;
struct bch_fs;
struct keylist;
struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
enum btree_id, unsigned *);
static inline void journal_wake(struct journal *j)
{
wake_up(&j->wait);
closure_wake_up(&j->async_wait);
}
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
static inline struct journal_buf *journal_cur_buf(struct journal *j)
{
return j->buf + j->reservations.idx;
}
static inline struct journal_buf *journal_prev_buf(struct journal *j)
{
return j->buf + !j->reservations.idx;
}
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
{
return j->pin.front;
}
static inline u64 journal_cur_seq(struct journal *j)
{
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
return j->pin.back - 1;
}
u64 bch2_inode_journal_seq(struct journal *, u64);
@ -213,21 +178,18 @@ static inline unsigned jset_u64s(unsigned u64s)
return u64s + sizeof(struct jset_entry) / sizeof(u64);
}
static inline void bch2_journal_add_entry_at(struct journal_buf *buf,
unsigned offset,
unsigned type, enum btree_id id,
unsigned level,
const void *data, size_t u64s)
static inline struct jset_entry *
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
{
struct jset_entry *entry = vstruct_idx(buf->data, offset);
struct jset *jset = buf->data;
struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
memset(entry, 0, sizeof(*entry));
entry->u64s = cpu_to_le16(u64s);
entry->btree_id = id;
entry->level = level;
entry->type = type;
memcpy_u64s(entry->_data, data, u64s);
le32_add_cpu(&jset->u64s, jset_u64s(u64s));
return entry;
}
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
@ -236,21 +198,27 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res
const void *data, unsigned u64s)
{
struct journal_buf *buf = &j->buf[res->idx];
struct jset_entry *entry = vstruct_idx(buf->data, res->offset);
unsigned actual = jset_u64s(u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
bch2_journal_add_entry_at(buf, res->offset, type,
id, level, data, u64s);
res->offset += actual;
res->u64s -= actual;
memset(entry, 0, sizeof(*entry));
entry->u64s = cpu_to_le16(u64s);
entry->type = type;
entry->btree_id = id;
entry->level = level;
memcpy_u64s(entry->_data, data, u64s);
}
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
enum btree_id id, const struct bkey_i *k)
{
bch2_journal_add_entry(j, res, JOURNAL_ENTRY_BTREE_KEYS,
bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
id, 0, k, k->k.u64s);
}
@ -292,7 +260,7 @@ static inline void bch2_journal_res_put(struct journal *j,
while (res->u64s)
bch2_journal_add_entry(j, res,
JOURNAL_ENTRY_BTREE_KEYS,
BCH_JSET_ENTRY_btree_keys,
0, 0, NULL, 0);
bch2_journal_buf_put(j, res->idx, false);
@ -368,7 +336,6 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
int bch2_journal_flush_device(struct journal *, int);
void bch2_journal_halt(struct journal *);
@ -385,10 +352,8 @@ static inline bool journal_flushes_device(struct bch_dev *ca)
return true;
}
void bch2_journal_start(struct bch_fs *);
int bch2_journal_mark(struct bch_fs *, struct list_head *);
void bch2_journal_entries_free(struct list_head *);
int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);
static inline void bch2_journal_set_replay_done(struct journal *j)
@ -404,6 +369,7 @@ int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
void bch2_fs_journal_start(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);

libbcachefs/journal_io.c (1423 lines)

File diff suppressed because it is too large

libbcachefs/journal_io.h (45 lines)

@ -0,0 +1,45 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
enum btree_id, unsigned *);
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
/* must be last: */
struct jset j;
};
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
while (entry < vstruct_last(jset)) {
if (entry->type == type)
return entry;
entry = vstruct_next(entry);
}
return NULL;
}
#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_entry_sectors(struct journal *);
void bch2_journal_write(struct closure *);
#endif /* _BCACHEFS_JOURNAL_IO_H */


@ -0,0 +1,411 @@
#include "bcachefs.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
/*
* Journal entry pinning - machinery for holding a reference on a given journal
* entry, holding it open to ensure it gets replayed during recovery:
*/
static inline u64 journal_pin_seq(struct journal *j,
struct journal_entry_pin_list *pin_list)
{
return fifo_entry_idx_abs(&j->pin, pin_list);
}
u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
{
u64 ret = 0;
spin_lock(&j->lock);
if (journal_pin_active(pin))
ret = journal_pin_seq(j, pin->pin_list);
spin_unlock(&j->lock);
return ret;
}
static inline void __journal_pin_add(struct journal *j,
struct journal_entry_pin_list *pin_list,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
BUG_ON(journal_pin_active(pin));
BUG_ON(!atomic_read(&pin_list->count));
atomic_inc(&pin_list->count);
pin->pin_list = pin_list;
pin->flush = flush_fn;
if (flush_fn)
list_add(&pin->list, &pin_list->list);
else
INIT_LIST_HEAD(&pin->list);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
journal_wake(j);
}
void bch2_journal_pin_add(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
__journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
spin_unlock(&j->lock);
}
static inline void __journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
struct journal_entry_pin_list *pin_list = pin->pin_list;
if (!journal_pin_active(pin))
return;
pin->pin_list = NULL;
list_del_init(&pin->list);
/*
* Unpinning a journal entry may make journal_next_bucket() succeed, if
* writing a new last_seq will now make another bucket available:
*/
if (atomic_dec_and_test(&pin_list->count) &&
pin_list == &fifo_peek_front(&j->pin))
bch2_journal_reclaim_fast(j);
}
void bch2_journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
spin_lock(&j->lock);
__journal_pin_drop(j, pin);
spin_unlock(&j->lock);
}
void bch2_journal_pin_add_if_older(struct journal *j,
struct journal_entry_pin *src_pin,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
if (journal_pin_active(src_pin) &&
(!journal_pin_active(pin) ||
journal_pin_seq(j, src_pin->pin_list) <
journal_pin_seq(j, pin->pin_list))) {
__journal_pin_drop(j, pin);
__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
}
spin_unlock(&j->lock);
}
/*
* Journal reclaim: flush references to open journal entries to reclaim space in
* the journal
*
* May be done by the journal code in the background as needed to free up space
* for more journal entries, or as part of doing a clean shutdown, or to migrate
* data off of a specific device:
*/
/**
* bch2_journal_reclaim_fast - do the fast part of journal reclaim
*
* Called from IO submission context, does not block. Cleans up after btree
* write completions by advancing the journal pin and each cache's last_idx,
* kicking off discards and background reclaim as necessary.
*/
void bch2_journal_reclaim_fast(struct journal *j)
{
struct journal_entry_pin_list temp;
bool popped = false;
lockdep_assert_held(&j->lock);
/*
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;
}
if (popped)
journal_wake(j);
}
static struct journal_entry_pin *
__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret;
u64 iter;
/* no need to iterate over empty fifo entries: */
bch2_journal_reclaim_fast(j);
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
if (iter > seq_to_flush)
break;
ret = list_first_entry_or_null(&pin_list->list,
struct journal_entry_pin, list);
if (ret) {
/* must be list_del_init(), see bch2_journal_pin_drop() */
list_move(&ret->list, &pin_list->flushed);
*seq = iter;
return ret;
}
}
return NULL;
}
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin *ret;
spin_lock(&j->lock);
ret = __journal_get_next_pin(j, seq_to_flush, seq);
spin_unlock(&j->lock);
return ret;
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
spin_lock(&j->lock);
ret = ja->nr &&
(ja->last_idx != ja->cur_idx &&
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
spin_unlock(&j->lock);
return ret;
}
/**
* bch2_journal_reclaim_work - free up journal buckets
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
*
* High watermarks for triggering background reclaim:
* - FIFO has fewer than 512 entries left
* - fewer than 25% journal buckets free
*
* Background reclaim runs until low watermarks are reached:
* - FIFO has more than 1024 entries left
* - more than 50% journal buckets free
*
* As long as a reclaim can complete in the time it takes to fill up
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
void bch2_journal_reclaim_work(struct work_struct *work)
{
struct bch_fs *c = container_of(to_delayed_work(work),
struct bch_fs, journal.reclaim_work);
struct journal *j = &c->journal;
struct bch_dev *ca;
struct journal_entry_pin *pin;
u64 seq, seq_to_flush = 0;
unsigned iter, bucket_to_flush;
unsigned long next_flush;
bool reclaim_lock_held = false, need_flush;
/*
* Advance last_idx to point to the oldest journal entry containing
* btree node updates that have not yet been written out
*/
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
while (should_discard_bucket(j, ja)) {
if (!reclaim_lock_held) {
/*
* ugh:
* might be called from __journal_res_get()
* under wait_event() - have to go back to
* TASK_RUNNING before doing something that
* would block, but only if we're doing work:
*/
__set_current_state(TASK_RUNNING);
mutex_lock(&j->reclaim_lock);
reclaim_lock_held = true;
/* recheck under reclaim_lock: */
continue;
}
if (ca->mi.discard &&
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
ja->buckets[ja->last_idx]),
ca->mi.bucket_size, GFP_NOIO, 0);
spin_lock(&j->lock);
ja->last_idx = (ja->last_idx + 1) % ja->nr;
spin_unlock(&j->lock);
journal_wake(j);
}
/*
* Write out enough btree nodes to free up 50% journal
* buckets
*/
spin_lock(&j->lock);
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
spin_unlock(&j->lock);
}
if (reclaim_lock_held)
mutex_unlock(&j->reclaim_lock);
/* Also flush if the pin fifo is more than half full */
spin_lock(&j->lock);
seq_to_flush = max_t(s64, seq_to_flush,
(s64) journal_cur_seq(j) -
(j->pin.size >> 1));
spin_unlock(&j->lock);
/*
* If it's been longer than j->reclaim_delay_ms since we last flushed,
* make sure to flush at least one journal pin:
*/
next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
need_flush = time_after(jiffies, next_flush);
while ((pin = journal_get_next_pin(j, need_flush
? U64_MAX
: seq_to_flush, &seq))) {
__set_current_state(TASK_RUNNING);
pin->flush(j, pin, seq);
need_flush = false;
j->last_flushed = jiffies;
}
if (!test_bit(BCH_FS_RO, &c->flags))
queue_delayed_work(system_freezable_wq, &j->reclaim_work,
msecs_to_jiffies(j->reclaim_delay_ms));
}
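The "free up 50% of the journal buckets" target above is plain ring arithmetic on ja->cur_idx and ja->nr; a worked sketch with hypothetical numbers:

#include <stdio.h>

static unsigned bucket_to_flush(unsigned cur_idx, unsigned nr)
{
        return (cur_idx + (nr >> 1)) % nr;
}

int main(void)
{
        /* 8 journal buckets, currently filling bucket 6: flushing every pin
         * up to bucket_seq[2] frees roughly half the ring */
        printf("%u\n", bucket_to_flush(6, 8));  /* prints 2 */
        return 0;
}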
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
struct journal_entry_pin **pin,
u64 *pin_seq)
{
int ret;
*pin = NULL;
ret = bch2_journal_error(j);
if (ret)
return ret;
spin_lock(&j->lock);
/*
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
*/
ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
!test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
(fifo_used(&j->pin) == 1 &&
atomic_read(&fifo_peek_front(&j->pin).count) == 1);
spin_unlock(&j->lock);
return ret;
}
int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin *pin;
u64 pin_seq;
bool flush;
if (!test_bit(JOURNAL_STARTED, &j->flags))
return 0;
again:
wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
if (pin) {
/* flushing a journal pin might cause a new one to be added: */
pin->flush(j, pin, pin_seq);
goto again;
}
spin_lock(&j->lock);
flush = journal_last_seq(j) != j->last_seq_ondisk ||
(seq_to_flush == U64_MAX && c->btree_roots_dirty);
spin_unlock(&j->lock);
return flush ? bch2_journal_meta(j) : 0;
}
int bch2_journal_flush_all_pins(struct journal *j)
{
return bch2_journal_flush_pins(j, U64_MAX);
}
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct bch_devs_list devs;
u64 iter, seq = 0;
int ret = 0;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(p, &j->pin, iter)
if (dev_idx >= 0
? bch2_dev_list_has_dev(p->devs, dev_idx)
: p->devs.nr < c->opts.metadata_replicas)
seq = iter;
spin_unlock(&j->lock);
ret = bch2_journal_flush_pins(j, seq);
if (ret)
return ret;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
seq = 0;
spin_lock(&j->lock);
while (!ret && seq < j->pin.back) {
seq = max(seq, journal_last_seq(j));
devs = journal_seq_pin(j, seq)->devs;
seq++;
spin_unlock(&j->lock);
ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}


@ -0,0 +1,36 @@
#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
#define _BCACHEFS_JOURNAL_RECLAIM_H
#define JOURNAL_PIN (32 * 1024)
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
return pin->pin_list != NULL;
}
static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
BUG_ON(seq < j->pin.front || seq >= j->pin.back);
return &j->pin.data[seq & j->pin.mask];
}
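journal_seq_pin() maps a sequence number onto a power-of-two ring; a tiny sketch of the masking (the sizes here are made up):

#include <stdio.h>

int main(void)
{
        unsigned long size = 8, mask = size - 1;  /* like j->pin.size / j->pin.mask */
        unsigned long seq = 21;

        printf("%lu\n", seq & mask);    /* sequence 21 lives in slot 5 */
        return 0;
}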
u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
int bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
int bch2_journal_flush_device_pins(struct journal *, int);
#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */


@ -0,0 +1,358 @@
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
/*
* journal_seq_blacklist machinery:
*
* To guarantee order of btree updates after a crash, we need to detect when a
* btree node entry (bset) is newer than the newest journal entry that was
* successfully written, and ignore it - effectively ignoring any btree updates
* that didn't make it into the journal.
*
* If we didn't do this, we might have two btree nodes, a and b, both with
* updates that weren't written to the journal yet: if b was updated after a,
* but b was flushed and not a - oops; on recovery we'll find that the updates
* to b happened, but not the updates to a that happened before it.
*
* Ignoring bsets that are newer than the newest journal entry is always safe,
* because everything they contain will also have been journalled - and must
* still be present in the journal on disk until a journal entry has been
* written _after_ that bset was written.
*
* To accomplish this, bsets record the newest journal sequence number they
* contain updates for; then, on startup, the btree code queries the journal
* code to ask "Is this sequence number newer than the newest journal entry? If
* so, ignore it."
*
* When this happens, we must blacklist that journal sequence number: the
* journal must not write any entries with that sequence number, and it must
* record that it was blacklisted so that a) on recovery we don't think we have
* missing journal entries and b) so that the btree code continues to ignore
* that bset, until that btree node is rewritten.
*
* Blacklisted journal sequence numbers are themselves recorded as entries in
* the journal.
*/
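The rule described above boils down to one check per bset; a minimal sketch of the decision (not the bcachefs API, just the shape of bch2_journal_seq_should_ignore() in miniature):

#include <stdbool.h>
#include <stdint.h>

/* ignore a bset if it is newer than the newest journal entry that made it to
 * disk, or if its sequence number was blacklisted by an earlier recovery */
static bool should_ignore_bset(uint64_t bset_seq, uint64_t newest_written_seq,
                               bool seq_is_blacklisted)
{
        return bset_seq > newest_written_seq || seq_is_blacklisted;
}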
/*
* Called when journal needs to evict a blacklist entry to reclaim space: find
* any btree nodes that refer to the blacklist journal sequence numbers, and
* rewrite them:
*/
static void journal_seq_blacklist_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
{
struct bch_fs *c =
container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl =
container_of(pin, struct journal_seq_blacklist, pin);
struct blacklisted_node n;
struct closure cl;
unsigned i;
int ret;
closure_init_stack(&cl);
for (i = 0;; i++) {
struct btree_iter iter;
struct btree *b;
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
b = bch2_btree_iter_peek_node(&iter);
/* The node might have already been rewritten: */
if (b->data->keys.seq == n.seq) {
ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
if (ret) {
bch2_btree_iter_unlock(&iter);
bch2_fs_fatal_error(c,
"error %i rewriting btree node with blacklisted journal seq",
ret);
bch2_journal_halt(j);
return;
}
}
bch2_btree_iter_unlock(&iter);
}
for (i = 0;; i++) {
struct btree_update *as;
struct pending_btree_node_free *d;
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
redo_wait:
mutex_lock(&c->btree_interior_update_lock);
/*
* Is the node on the list of pending interior node updates -
* being freed? If so, wait for that to finish:
*/
for_each_pending_btree_node_free(c, as, d)
if (n.seq == d->seq &&
n.btree_id == d->btree_id &&
!d->level &&
!bkey_cmp(n.pos, d->key.k.p)) {
closure_wait(&as->wait, &cl);
mutex_unlock(&c->btree_interior_update_lock);
closure_sync(&cl);
goto redo_wait;
}
mutex_unlock(&c->btree_interior_update_lock);
}
mutex_lock(&j->blacklist_lock);
bch2_journal_pin_drop(j, &bl->pin);
list_del(&bl->list);
kfree(bl->entries);
kfree(bl);
mutex_unlock(&j->blacklist_lock);
}
/*
* Determine if a particular sequence number is blacklisted - if so, return
* blacklist entry:
*/
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
{
struct journal_seq_blacklist *bl;
lockdep_assert_held(&j->blacklist_lock);
list_for_each_entry(bl, &j->seq_blacklist, list)
if (seq >= bl->start && seq <= bl->end)
return bl;
return NULL;
}
/*
* Allocate a new, in memory blacklist entry:
*/
static struct journal_seq_blacklist *
bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
{
struct journal_seq_blacklist *bl;
lockdep_assert_held(&j->blacklist_lock);
/*
* When we start the journal, bch2_journal_start() will skip over @seq:
*/
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return NULL;
bl->start = start;
bl->end = end;
list_add_tail(&bl->list, &j->seq_blacklist);
return bl;
}
/*
* Returns true if @seq is newer than the most recent journal entry that got
* written, and data corresponding to @seq should be ignored - also marks @seq
* as blacklisted so that on future restarts the corresponding data will still
* be ignored:
*/
int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
{
struct journal *j = &c->journal;
struct journal_seq_blacklist *bl = NULL;
struct blacklisted_node *n;
u64 journal_seq;
int ret = 0;
if (!seq)
return 0;
spin_lock(&j->lock);
journal_seq = journal_cur_seq(j);
spin_unlock(&j->lock);
/* Interior updates aren't journalled: */
BUG_ON(b->level);
BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
/*
* Decrease this back to j->seq + 2 when we next rev the on disk format:
* increasing it temporarily to work around bug in old kernels
*/
fsck_err_on(seq > journal_seq + 4, c,
"bset journal seq too far in the future: %llu > %llu",
seq, journal_seq);
if (seq <= journal_seq &&
list_empty_careful(&j->seq_blacklist))
return 0;
mutex_lock(&j->blacklist_lock);
if (seq <= journal_seq) {
bl = bch2_journal_seq_blacklist_find(j, seq);
if (!bl)
goto out;
} else {
bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
if (!j->new_blacklist) {
j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
journal_seq + 1,
journal_seq + 1);
if (!j->new_blacklist) {
ret = -ENOMEM;
goto out;
}
}
bl = j->new_blacklist;
bl->end = max(bl->end, seq);
}
for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
if (b->data->keys.seq == n->seq &&
b->btree_id == n->btree_id &&
!bkey_cmp(b->key.k.p, n->pos))
goto found_entry;
if (!bl->nr_entries ||
is_power_of_2(bl->nr_entries)) {
n = krealloc(bl->entries,
max(bl->nr_entries * 2, 8UL) * sizeof(*n),
GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
goto out;
}
bl->entries = n;
}
bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
.seq = b->data->keys.seq,
.btree_id = b->btree_id,
.pos = b->key.k.p,
};
found_entry:
ret = 1;
out:
fsck_err:
mutex_unlock(&j->blacklist_lock);
return ret;
}
static int __bch2_journal_seq_blacklist_read(struct journal *j,
struct journal_replay *i,
u64 start, u64 end)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl;
bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
start, end);
bl = bch2_journal_seq_blacklisted_new(j, start, end);
if (!bl)
return -ENOMEM;
bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
journal_seq_blacklist_flush);
return 0;
}
/*
* After reading the journal, find existing journal seq blacklist entries and
* read them into memory:
*/
int bch2_journal_seq_blacklist_read(struct journal *j,
struct journal_replay *i)
{
struct jset_entry *entry;
int ret = 0;
vstruct_for_each(&i->j, entry) {
switch (entry->type) {
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
ret = __bch2_journal_seq_blacklist_read(j, i,
le64_to_cpu(bl_entry->seq),
le64_to_cpu(bl_entry->seq));
break;
}
case BCH_JSET_ENTRY_blacklist_v2: {
struct jset_entry_blacklist_v2 *bl_entry =
container_of(entry, struct jset_entry_blacklist_v2, entry);
ret = __bch2_journal_seq_blacklist_read(j, i,
le64_to_cpu(bl_entry->start),
le64_to_cpu(bl_entry->end));
break;
}
}
if (ret)
break;
}
return ret;
}
/*
* After reading the journal and walking the btree, we might have new journal
* sequence numbers to blacklist - add entries to the next journal entry to be
* written:
*/
void bch2_journal_seq_blacklist_write(struct journal *j)
{
struct journal_seq_blacklist *bl = j->new_blacklist;
struct jset_entry_blacklist_v2 *bl_entry;
struct jset_entry *entry;
if (!bl)
return;
entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
(sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
bl_entry->start = cpu_to_le64(bl->start);
bl_entry->end = cpu_to_le64(bl->end);
bch2_journal_pin_add(j,
journal_cur_seq(j),
&bl->pin,
journal_seq_blacklist_flush);
j->new_blacklist = NULL;
}

View File

@ -0,0 +1,13 @@
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
struct journal_replay;
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *, u64);
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
int bch2_journal_seq_blacklist_read(struct journal *,
struct journal_replay *);
void bch2_journal_seq_blacklist_write(struct journal *);
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */

View File

@ -59,8 +59,9 @@ struct blacklisted_node {
struct journal_seq_blacklist {
struct list_head list;
u64 seq;
bool written;
u64 start;
u64 end;
struct journal_entry_pin pin;
struct blacklisted_node *entries;
@ -171,10 +172,11 @@ struct journal {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
struct journal_entry_pin_list *replay_pin_list;
u64 replay_journal_seq;
struct mutex blacklist_lock;
struct list_head seq_blacklist;
struct journal_seq_blacklist *new_blacklist;
BKEY_PADDED(key);
struct write_point wp;

View File

@ -5,6 +5,7 @@
#include "buckets.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
@ -22,7 +23,6 @@ struct moving_io {
struct closure cl;
bool read_completed;
unsigned read_dev;
unsigned read_sectors;
unsigned write_sectors;
@ -42,7 +42,7 @@ struct moving_context {
struct list_head reads;
/* in flight sectors: */
atomic_t read_sectors[BCH_SB_MEMBERS_MAX];
atomic_t read_sectors;
atomic_t write_sectors;
wait_queue_head_t wait;
@ -306,7 +306,8 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (likely(!io->rbio.bio.bi_status)) {
if (likely(!io->rbio.bio.bi_status &&
!io->rbio.hole)) {
bch2_migrate_read_done(&io->write, &io->rbio);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
@ -330,7 +331,7 @@ static void move_read_endio(struct bio *bio)
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
struct moving_context *ctxt = io->write.ctxt;
atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
atomic_sub(io->read_sectors, &ctxt->read_sectors);
io->read_completed = true;
if (next_pending_write(ctxt))
@ -376,7 +377,6 @@ static int bch2_move_extent(struct bch_fs *c,
enum data_cmd data_cmd,
struct data_opts data_opts)
{
struct extent_pick_ptr pick;
struct moving_io *io;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
@ -387,12 +387,8 @@ static int bch2_move_extent(struct bch_fs *c,
atomic_read(&ctxt->write_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
if (IS_ERR_OR_NULL(pick.ca))
return pick.ca ? PTR_ERR(pick.ca) : 0;
move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) <
atomic_read(&ctxt->read_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
/* write path might have to decompress data: */
@ -406,8 +402,7 @@ static int bch2_move_extent(struct bch_fs *c,
goto err;
io->write.ctxt = ctxt;
io->read_dev = pick.ca->dev_idx;
io->read_sectors = pick.crc.uncompressed_size;
io->read_sectors = e.k->size;
io->write_sectors = e.k->size;
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
@ -421,6 +416,7 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.opts = io_opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
io->rbio.bio.bi_vcnt = pages;
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
@ -438,7 +434,7 @@ static int bch2_move_extent(struct bch_fs *c,
trace_move_extent(e.k);
atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
/*
@ -446,14 +442,15 @@ static int bch2_move_extent(struct bch_fs *c,
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE);
bch2_read_extent(c, &io->rbio, e.s_c,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
err_free_pages:
bio_free_pages(&io->write.op.wbio.bio);
err_free:
kfree(io);
err:
percpu_ref_put(&pick.ca->io_ref);
trace_move_alloc_fail(e.k);
return ret;
}
@ -728,7 +725,7 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
stats->data_type = BCH_DATA_JOURNAL;
ret = bch2_journal_flush_device(&c->journal, -1);
ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
@ -745,7 +742,7 @@ int bch2_data_job(struct bch_fs *c,
return -EINVAL;
stats->data_type = BCH_DATA_JOURNAL;
ret = bch2_journal_flush_device(&c->journal, op.migrate.dev);
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;

View File

@ -26,6 +26,8 @@
#include "inode.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "migrate.h"
@ -396,9 +398,15 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
#define BCH_TIME_STAT(name) \
bch2_time_stats_exit(&c->name##_time);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_io_exit(c);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
@ -407,10 +415,6 @@ static void bch2_fs_free(struct bch_fs *c)
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
mempool_exit(&c->bio_bounce_pages);
bioset_exit(&c->bio_write);
bioset_exit(&c->bio_read_split);
bioset_exit(&c->bio_read);
bioset_exit(&c->btree_bio);
mempool_exit(&c->btree_interior_update_pool);
mempool_exit(&c->btree_reserve_pool);
@ -561,8 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_rwsem(&c->gc_lock);
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
spin_lock_init(&c->name##_time.lock);
#define BCH_TIME_STAT(name) \
bch2_time_stats_init(&c->name##_time);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@ -590,6 +594,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->copy_gc_enabled = 1;
c->rebalance_enabled = 1;
c->rebalance_percent = 10;
c->promote_whole_extents = true;
c->journal.write_time = &c->journal_write_time;
c->journal.delay_time = &c->journal_delay_time;
@ -640,17 +645,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
BIOSET_NEED_BVECS) ||
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
c->sb.encoded_extent_max) /
PAGE_SECTORS, 0) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
@ -658,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
bch2_fs_btree_cache_init(c) ||
bch2_fs_io_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
bch2_fs_fsio_init(c))
@ -774,11 +769,11 @@ const char *bch2_fs_start(struct bch_fs *c)
goto recovery_done;
/*
* bch2_journal_start() can't happen sooner, or btree_gc_finish()
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
* will give spurious errors about oldest_gen > bucket_gen -
* this is a hack but oh well.
*/
bch2_journal_start(c);
bch2_fs_journal_start(&c->journal);
err = "error starting allocator";
if (bch2_fs_allocator_start(c))
@ -834,7 +829,7 @@ const char *bch2_fs_start(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_journal_start(c);
bch2_fs_journal_start(&c->journal);
bch2_journal_set_replay_done(&c->journal);
err = "error starting allocator";
@ -993,6 +988,9 @@ static void bch2_dev_free(struct bch_dev *ca)
bioset_exit(&ca->replica_set);
bch2_dev_buckets_free(ca);
bch2_time_stats_exit(&ca->io_latency[WRITE]);
bch2_time_stats_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
kobject_put(&ca->kobj);
@ -1089,6 +1087,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
bch2_time_stats_init(&ca->io_latency[READ]);
bch2_time_stats_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
@ -1421,7 +1422,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
goto err;
}
ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
if (ret) {
bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;

View File

@ -27,7 +27,26 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
return ca->disk_sb.bdev != NULL;
return !percpu_ref_is_zero(&ca->io_ref);
}
static inline bool bch2_dev_is_readable(struct bch_dev *ca)
{
return bch2_dev_is_online(ca) &&
ca->mi.state != BCH_MEMBER_STATE_FAILED;
}
static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
{
if (!percpu_ref_tryget(&ca->io_ref))
return false;
if (ca->mi.state == BCH_MEMBER_STATE_RW ||
(ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ))
return true;
percpu_ref_put(&ca->io_ref);
return false;
}
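bch2_dev_get_ioref() above is the usual try-get/validate/put-on-failure idiom: take the reference first, then check that the member state still allows the requested access, and drop the reference again if it does not. A standalone sketch of the same idiom using C11 atomics in place of the kernel's percpu_ref (all names invented for illustration):

/* Sketch: acquire a ref, validate state, release on mismatch. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum state { STATE_RW, STATE_RO, STATE_FAILED };

struct dev {
	atomic_uint refs;	/* 0 means "going away": tryget must fail */
	enum state state;
};

static bool dev_tryget(struct dev *d)
{
	unsigned v = atomic_load(&d->refs);

	while (v)
		if (atomic_compare_exchange_weak(&d->refs, &v, v + 1))
			return true;
	return false;
}

static void dev_put(struct dev *d)
{
	atomic_fetch_sub(&d->refs, 1);
}

/* Take a reference only if the device may be used for the given access. */
static bool dev_get_ioref(struct dev *d, bool write)
{
	if (!dev_tryget(d))
		return false;

	if (d->state == STATE_RW || (d->state == STATE_RO && !write))
		return true;

	dev_put(d);		/* state check failed: undo the get */
	return false;
}

int main(void)
{
	static struct dev d = { .state = STATE_RO };

	atomic_init(&d.refs, 1);	/* base reference held by the owner */

	if (dev_get_ioref(&d, false)) {		/* RO device: reads allowed */
		printf("read ref taken\n");
		dev_put(&d);
	}
	if (!dev_get_ioref(&d, true))		/* ...but writes are refused */
		printf("write ref refused\n");
	return 0;
}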
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)

View File

@ -141,11 +141,19 @@ read_attribute(btree_node_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(durability);
read_attribute(iostats);
read_attribute(last_read_quantiles);
read_attribute(last_write_quantiles);
read_attribute(fragmentation_quantiles);
read_attribute(oldest_gen_quantiles);
read_attribute(iodone);
read_attribute(io_latency_read);
read_attribute(io_latency_write);
read_attribute(io_latency_stats_read);
read_attribute(io_latency_stats_write);
read_attribute(congested);
read_attribute(bucket_quantiles_last_read);
read_attribute(bucket_quantiles_last_write);
read_attribute(bucket_quantiles_fragmentation);
read_attribute(bucket_quantiles_oldest_gen);
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@ -177,6 +185,7 @@ sysfs_pd_controller_attribute(copy_gc);
rw_attribute(rebalance_enabled);
rw_attribute(rebalance_percent);
sysfs_pd_controller_attribute(rebalance);
rw_attribute(promote_whole_extents);
rw_attribute(pd_controllers_update_seconds);
@ -189,8 +198,9 @@ read_attribute(data_replicas_have);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
sysfs_time_stats_attribute(name, frequency_units, duration_units);
#define BCH_TIME_STAT(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = S_IRUGO };
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@ -332,9 +342,10 @@ SHOW(bch2_fs)
sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled);
sysfs_print(rebalance_percent, c->rebalance_percent);
sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */
sysfs_print(promote_whole_extents, c->promote_whole_extents);
sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
@ -406,6 +417,8 @@ STORE(__bch2_fs)
sysfs_strtoul(rebalance_percent, c->rebalance_percent);
sysfs_pd_controller_store(rebalance, &c->rebalance_pd);
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
/* Debugging: */
#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
@ -462,6 +475,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_reclaim_delay_ms,
&sysfs_rebalance_percent,
&sysfs_promote_whole_extents,
&sysfs_compression_stats,
NULL
@ -531,9 +545,16 @@ STORE(bch2_fs_opts_dir)
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
int ret, id = opt - bch2_opt_table;
char *tmp;
u64 v;
ret = bch2_opt_parse(c, opt, buf, &v);
tmp = kstrdup(buf, GFP_KERNEL);
if (!tmp)
return -ENOMEM;
ret = bch2_opt_parse(c, opt, strim(tmp), &v);
kfree(tmp);
if (ret < 0)
return ret;
@ -592,9 +613,9 @@ SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
sysfs_print_time_stats(&c->name##_time, name, \
frequency_units, duration_units);
#define BCH_TIME_STAT(name) \
if (attr == &sysfs_time_stat_##name) \
return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@ -603,23 +624,15 @@ SHOW(bch2_fs_time_stats)
STORE(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
sysfs_clear_time_stats(&c->name##_time, name);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
return size;
}
SYSFS_OPS(bch2_fs_time_stats);
struct attribute *bch2_fs_time_stats_files[] = {
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
sysfs_time_stats_attribute_list(name, frequency_units, duration_units)
#define BCH_TIME_STAT(name) \
&sysfs_time_stat_##name,
BCH_TIME_STATS()
#undef BCH_TIME_STAT
NULL
};
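The BCH_TIME_STAT()/BCH_TIME_STATS() pairs used throughout this patch are an X-macro: a single list of names is expanded several times, each time with a different per-name definition (struct fields, init and exit calls, sysfs attributes, show handlers). A small self-contained illustration of the idiom, with invented stat names:

/* X-macro sketch: one name list, expanded three different ways. */
#include <stdio.h>

#define TIME_STATS()		\
	x(btree_read)		\
	x(journal_write)	\
	x(data_promote)

/* 1) one counter field per name */
struct stats {
#define x(name) unsigned long name##_count;
	TIME_STATS()
#undef x
};

/* 2) a printable name table */
static const char * const stat_names[] = {
#define x(name) #name,
	TIME_STATS()
#undef x
};

/* 3) one dump line per name */
static void dump(const struct stats *s)
{
#define x(name) printf("%-16s %lu\n", #name, s->name##_count);
	TIME_STATS()
#undef x
}

int main(void)
{
	struct stats s = { .btree_read_count = 3, .journal_write_count = 7 };

	dump(&s);
	printf("%zu stats total\n", sizeof(stat_names) / sizeof(stat_names[0]));
	return 0;
}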
@ -774,7 +787,7 @@ static const char * const bch2_rw[] = {
NULL
};
static ssize_t show_dev_iostats(struct bch_dev *ca, char *buf)
static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf)
{
char *out = buf, *end = buf + PAGE_SIZE;
int rw, i, cpu;
@ -851,16 +864,28 @@ SHOW(bch2_dev)
return out - buf;
}
if (attr == &sysfs_iostats)
return show_dev_iostats(ca, buf);
if (attr == &sysfs_iodone)
return show_dev_iodone(ca, buf);
if (attr == &sysfs_last_read_quantiles)
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
if (attr == &sysfs_io_latency_stats_read)
return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE);
if (attr == &sysfs_io_latency_stats_write)
return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
if (attr == &sysfs_bucket_quantiles_last_read)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
if (attr == &sysfs_last_write_quantiles)
if (attr == &sysfs_bucket_quantiles_last_write)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
if (attr == &sysfs_fragmentation_quantiles)
if (attr == &sysfs_bucket_quantiles_fragmentation)
return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
if (attr == &sysfs_oldest_gen_quantiles)
if (attr == &sysfs_bucket_quantiles_oldest_gen)
return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
if (attr == &sysfs_reserve_stats)
@ -944,13 +969,20 @@ struct attribute *bch2_dev_files[] = {
&sysfs_label,
&sysfs_has_data,
&sysfs_iostats,
&sysfs_iodone,
&sysfs_io_latency_read,
&sysfs_io_latency_write,
&sysfs_io_latency_stats_read,
&sysfs_io_latency_stats_write,
&sysfs_congested,
/* alloc info - other stats: */
&sysfs_last_read_quantiles,
&sysfs_last_write_quantiles,
&sysfs_fragmentation_quantiles,
&sysfs_oldest_gen_quantiles,
&sysfs_bucket_quantiles_last_read,
&sysfs_bucket_quantiles_last_write,
&sysfs_bucket_quantiles_fragmentation,
&sysfs_bucket_quantiles_oldest_gen,
&sysfs_reserve_stats,
/* debug: */

View File

@ -13,12 +13,15 @@
#include <linux/kthread.h>
#include <linux/log2.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/sched/clock.h>
#include "eytzinger.h"
#include "util.h"
#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
@ -200,59 +203,189 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
void bch2_time_stats_clear(struct time_stats *stats)
void bch2_quantiles_update(struct quantiles *q, u64 v)
{
spin_lock(&stats->lock);
unsigned i = 0;
stats->count = 0;
stats->last_duration = 0;
stats->max_duration = 0;
stats->average_duration = 0;
stats->average_frequency = 0;
stats->last = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct quantile_entry *e = q->entries + i;
spin_unlock(&stats->lock);
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
void __bch2_time_stats_update(struct time_stats *stats, u64 start_time)
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
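bch2_quantiles_update() above is a small streaming quantile estimator: each entry keeps an estimate m and an adaptive step, every sample nudges the estimates along one root-to-leaf path of an implicit eytzinger-ordered tree, and an entry's step halves once its estimate lands within one step of the sample. A standalone sketch of the same update, assuming plain heap-style child indexing in place of eytzinger0_child() and dropping the U32_MAX overflow guard for brevity:

/* Sketch: streaming quantile estimation by stepwise nudging. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_Q 15

struct qentry { uint64_t m, step; };

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

static void quantiles_update(struct qentry *q, uint64_t v)
{
	unsigned i = 0;

	while (i < NR_Q) {
		struct qentry *e = &q[i];

		if (!e->step) {
			/* first sample seen by this entry seeds the estimate */
			e->m = v;
			e->step = max_u64(v / 2, 1024);
		} else if (e->m > v) {
			e->m = e->m >= e->step ? e->m - e->step : 0;
		} else if (e->m < v) {
			e->m += e->step;	/* overflow guard omitted */
		}

		/* estimate is close to the sample: refine the step */
		if ((e->m > v ? e->m - v : v - e->m) < e->step)
			e->step = max_u64(e->step / 2, 1);

		if (v >= e->m)
			break;

		/* descend to a child, as eytzinger0_child() does */
		i = 2 * i + 1 + (v > e->m);
	}
}

int main(void)
{
	struct qentry q[NR_Q] = {{0, 0}};

	for (int n = 0; n < 1000000; n++)
		quantiles_update(q, 1 + rand() % 1000000);

	for (unsigned i = 0; i < NR_Q; i++)
		printf("entry %2u: m=%llu step=%llu\n", i,
		       (unsigned long long)q[i].m,
		       (unsigned long long)q[i].step);
	return 0;
}

Since descent only happens while the sample is still below the current estimate, deeper entries converge toward progressively lower quantiles.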
/* time stats: */
static void bch2_time_stats_update_one(struct time_stats *stats,
u64 start, u64 end)
{
u64 now, duration, last;
u64 duration, freq;
duration = time_after64(end, start)
? end - start : 0;
freq = time_after64(end, stats->last_event)
? end - stats->last_event : 0;
stats->count++;
now = local_clock();
duration = time_after64(now, start_time)
? now - start_time : 0;
last = time_after64(now, stats->last)
? now - stats->last : 0;
stats->average_duration = stats->average_duration
? ewma_add(stats->average_duration, duration, 6)
: duration;
stats->average_frequency = stats->average_frequency
? ewma_add(stats->average_frequency, freq, 6)
: freq;
stats->last_duration = duration;
stats->max_duration = max(stats->max_duration, duration);
if (stats->last) {
stats->average_duration = ewma_add(stats->average_duration,
duration << 8, 3);
stats->last_event = end;
if (stats->average_frequency)
stats->average_frequency =
ewma_add(stats->average_frequency,
last << 8, 3);
else
stats->average_frequency = last << 8;
} else {
stats->average_duration = duration << 8;
bch2_quantiles_update(&stats->quantiles, duration);
}
stats->last = now ?: 1;
}
void bch2_time_stats_update(struct time_stats *stats, u64 start_time)
void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
{
spin_lock(&stats->lock);
__bch2_time_stats_update(stats, start_time);
spin_unlock(&stats->lock);
unsigned long flags;
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
bch2_time_stats_update_one(stats, start, end);
if (stats->average_frequency < 32 &&
stats->count > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
GFP_ATOMIC);
spin_unlock_irqrestore(&stats->lock, flags);
} else {
struct time_stat_buffer_entry *i;
struct time_stat_buffer *b;
preempt_disable();
b = this_cpu_ptr(stats->buffer);
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
b->entries[b->nr++] = (struct time_stat_buffer_entry) {
.start = start,
.end = end
};
if (b->nr == ARRAY_SIZE(b->entries)) {
spin_lock_irqsave(&stats->lock, flags);
for (i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries);
i++)
bch2_time_stats_update_one(stats, i->start, i->end);
spin_unlock_irqrestore(&stats->lock, flags);
b->nr = 0;
}
preempt_enable();
}
}
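The function above keeps the slow path (spinlock per event) until an event proves frequent, then switches to buffering (start, end) pairs per CPU and folding a full buffer into the stats under a single lock acquisition. A much-simplified, single-lock, thread-local sketch of that batching idea (illustrative only, not the kernel's percpu machinery):

/* Sketch: batch samples locally, commit them under the lock in bulk. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define BATCH 32

struct stats {
	pthread_mutex_t lock;
	uint64_t count;
	uint64_t total_ns;
};

static _Thread_local struct {
	unsigned nr;
	uint64_t durations[BATCH];
} buf;

static void stats_commit_one(struct stats *s, uint64_t d)
{
	s->count++;
	s->total_ns += d;
}

static void stats_update(struct stats *s, uint64_t duration_ns)
{
	buf.durations[buf.nr++] = duration_ns;

	if (buf.nr == BATCH) {
		/* one lock round-trip per BATCH events instead of per event */
		pthread_mutex_lock(&s->lock);
		for (unsigned i = 0; i < BATCH; i++)
			stats_commit_one(s, buf.durations[i]);
		pthread_mutex_unlock(&s->lock);
		buf.nr = 0;
	}
}

int main(void)
{
	static struct stats s = { .lock = PTHREAD_MUTEX_INITIALIZER };

	for (uint64_t i = 0; i < 1024; i++)
		stats_update(&s, 100 + i % 7);

	printf("count=%llu avg=%llu ns\n",
	       (unsigned long long)s.count,
	       (unsigned long long)(s.total_ns / s.count));
	return 0;
}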
static const struct time_unit {
const char *name;
u32 nsecs;
} time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "sec", NSEC_PER_SEC },
};
static const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
static size_t pr_time_units(char *buf, size_t len, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
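pick_time_units()/pr_time_units() above choose the largest unit that still leaves at least two of that unit, then print the value integer-divided by it. A userspace sketch of the same selection, with hard-coded constants standing in for NSEC_PER_USEC and friends:

/* Sketch: auto-scaling a nanosecond value to a readable unit. */
#include <stdint.h>
#include <stdio.h>

struct time_unit { const char *name; uint32_t nsecs; };

static const struct time_unit time_units[] = {
	{ "ns",  1          },
	{ "us",  1000       },
	{ "ms",  1000000    },
	{ "sec", 1000000000 },
};

static const struct time_unit *pick_time_units(uint64_t ns)
{
	const struct time_unit *u;

	/* advance while the value is still at least two of the next unit */
	for (u = time_units;
	     u + 1 < time_units + sizeof(time_units) / sizeof(time_units[0]) &&
	     ns >= (uint64_t) u[1].nsecs << 1;
	     u++)
		;
	return u;
}

int main(void)
{
	const uint64_t samples[] = { 740, 53000, 8200000, 3100000000ULL };

	/* prints "740 ns", "53 us", "8 ms", "3 sec" */
	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		const struct time_unit *u = pick_time_units(samples[i]);

		printf("%llu %s\n",
		       (unsigned long long)(samples[i] / u->nsecs), u->name);
	}
	return 0;
}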
size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len)
{
char *out = buf, *end = buf + len;
const struct time_unit *u;
u64 freq = READ_ONCE(stats->average_frequency);
u64 q, last_q = 0;
int i;
out += scnprintf(out, end - out, "count:\t\t%llu\n",
stats->count);
out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n",
freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
out += scnprintf(out, end - out, "frequency:\t");
out += pr_time_units(out, end - out, freq);
out += scnprintf(out, end - out, "\navg duration:\t");
out += pr_time_units(out, end - out, stats->average_duration);
out += scnprintf(out, end - out, "\nmax duration:\t");
out += pr_time_units(out, end - out, stats->max_duration);
i = eytzinger0_first(NR_QUANTILES);
u = pick_time_units(stats->quantiles.entries[i].m);
out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
q = max(stats->quantiles.entries[i].m, last_q);
out += scnprintf(out, end - out, "%llu%s",
div_u64(q, u->nsecs),
is_last ? "\n" : " ");
last_q = q;
}
return out - buf;
}
void bch2_time_stats_exit(struct time_stats *stats)
{
free_percpu(stats->buffer);
}
void bch2_time_stats_init(struct time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
spin_lock_init(&stats->lock);
}
/* ratelimit: */
/**
* bch2_ratelimit_delay() - return how long to delay until the next time to do
* some work
@ -310,6 +443,8 @@ int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
}
}
/* pd controller: */
/*
* Updates pd_controller. Attempts to scale input values to units per second.
* @target: desired value
@ -404,6 +539,8 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
derivative, change, next_io);
}
/* misc: */
void bch2_bio_map(struct bio *bio, void *base)
{
size_t size = bio->bi_iter.bi_size;

View File

@ -371,87 +371,50 @@ ssize_t bch2_read_string_list(const char *, const char * const[]);
ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64);
u64 bch2_read_flag_list(char *, const char * const[]);
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
struct quantiles {
struct quantile_entry {
u64 m;
u64 step;
} entries[NR_QUANTILES];
};
struct time_stat_buffer {
unsigned nr;
struct time_stat_buffer_entry {
u64 start;
u64 end;
} entries[32];
};
struct time_stats {
spinlock_t lock;
u64 count;
/*
* all fields are in nanoseconds, averages are ewmas stored left shifted
* by 8
*/
u64 last_duration;
u64 max_duration;
/* all fields are in nanoseconds */
u64 average_duration;
u64 average_frequency;
u64 last;
u64 max_duration;
u64 last_event;
struct quantiles quantiles;
struct time_stat_buffer __percpu *buffer;
};
void bch2_time_stats_clear(struct time_stats *stats);
void __bch2_time_stats_update(struct time_stats *stats, u64 time);
void bch2_time_stats_update(struct time_stats *stats, u64 time);
void __bch2_time_stats_update(struct time_stats *stats, u64, u64);
static inline unsigned local_clock_us(void)
static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
{
return local_clock() >> 10;
__bch2_time_stats_update(stats, start, local_clock());
}
#define NSEC_PER_ns 1L
#define NSEC_PER_us NSEC_PER_USEC
#define NSEC_PER_ms NSEC_PER_MSEC
#define NSEC_PER_sec NSEC_PER_SEC
size_t bch2_time_stats_print(struct time_stats *, char *, size_t);
#define __print_time_stat(stats, name, stat, units) \
sysfs_print(name ## _ ## stat ## _ ## units, \
div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
#define sysfs_print_time_stats(stats, name, \
frequency_units, \
duration_units) \
do { \
__print_time_stat(stats, name, \
average_frequency, frequency_units); \
__print_time_stat(stats, name, \
average_duration, duration_units); \
sysfs_print(name ## _ ##count, (stats)->count); \
sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \
div_u64((stats)->last_duration, \
NSEC_PER_ ## duration_units)); \
sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
div_u64((stats)->max_duration, \
NSEC_PER_ ## duration_units)); \
\
sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
? div_s64(local_clock() - (stats)->last, \
NSEC_PER_ ## frequency_units) \
: -1LL); \
} while (0)
#define sysfs_clear_time_stats(stats, name) \
do { \
if (attr == &sysfs_ ## name ## _clear) \
bch2_time_stats_clear(stats); \
} while (0)
#define sysfs_time_stats_attribute(name, \
frequency_units, \
duration_units) \
write_attribute(name ## _clear); \
read_attribute(name ## _count); \
read_attribute(name ## _average_frequency_ ## frequency_units); \
read_attribute(name ## _average_duration_ ## duration_units); \
read_attribute(name ## _last_duration_ ## duration_units); \
read_attribute(name ## _max_duration_ ## duration_units); \
read_attribute(name ## _last_ ## frequency_units)
#define sysfs_time_stats_attribute_list(name, \
frequency_units, \
duration_units) \
&sysfs_ ## name ## _clear, \
&sysfs_ ## name ## _count, \
&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
&sysfs_ ## name ## _average_duration_ ## duration_units, \
&sysfs_ ## name ## _last_duration_ ## duration_units, \
&sysfs_ ## name ## _max_duration_ ## duration_units, \
&sysfs_ ## name ## _last_ ## frequency_units,
void bch2_time_stats_exit(struct time_stats *);
void bch2_time_stats_init(struct time_stats *);
#define ewma_add(ewma, val, weight) \
({ \