Mirror of https://github.com/koverstreet/bcachefs-tools.git (synced 2025-02-22 00:00:03 +03:00)

Update bcachefs sources to ed4aea2ad4 bcachefs: fix gcc warning

parent c598d91dcb
commit 018de5aa89
@@ -1 +1 @@
edf5f38218f699e53913a549465f35d36c4418f7
ed4aea2ad4fa1b3891684cbd071d1a1ae9094342
@@ -69,6 +69,7 @@
#include "extents.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "super-io.h"

#include <linux/blkdev.h>
@@ -271,17 +271,19 @@ do { \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif

/* name, frequency_units, duration_units */
#define BCH_TIME_STATS() \
BCH_TIME_STAT(btree_node_mem_alloc, sec, us) \
BCH_TIME_STAT(btree_gc, sec, ms) \
BCH_TIME_STAT(btree_split, sec, us) \
BCH_TIME_STAT(btree_sort, ms, us) \
BCH_TIME_STAT(btree_read, ms, us) \
BCH_TIME_STAT(journal_write, us, us) \
BCH_TIME_STAT(journal_delay, ms, us) \
BCH_TIME_STAT(journal_blocked, sec, ms) \
BCH_TIME_STAT(journal_flush_seq, us, us)
#define BCH_TIME_STATS() \
BCH_TIME_STAT(btree_node_mem_alloc) \
BCH_TIME_STAT(btree_gc) \
BCH_TIME_STAT(btree_split) \
BCH_TIME_STAT(btree_sort) \
BCH_TIME_STAT(btree_read) \
BCH_TIME_STAT(data_write) \
BCH_TIME_STAT(data_read) \
BCH_TIME_STAT(data_promote) \
BCH_TIME_STAT(journal_write) \
BCH_TIME_STAT(journal_delay) \
BCH_TIME_STAT(journal_blocked) \
BCH_TIME_STAT(journal_flush_seq)

#include "alloc_types.h"
#include "buckets_types.h"
@@ -416,7 +418,12 @@ struct bch_dev {
struct work_struct io_error_work;

/* The rest of this all shows up in sysfs */
atomic_t latency[2];
atomic64_t cur_latency[2];
struct time_stats io_latency[2];

#define CONGESTED_MAX 1024
atomic_t congested;
u64 congested_last;

struct io_count __percpu *io_done;
};
@@ -644,6 +651,7 @@ struct bch_fs {
struct bio_set bio_write;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
struct rhashtable promote_table;

mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_NR];
@@ -708,12 +716,13 @@ struct bch_fs {
unsigned copy_gc_enabled:1;
unsigned rebalance_enabled:1;
unsigned rebalance_percent;
bool promote_whole_extents;

#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM

#define BCH_TIME_STAT(name, frequency_units, duration_units) \
#define BCH_TIME_STAT(name) \
struct time_stats name##_time;
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@@ -1088,13 +1088,14 @@ LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,

LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
struct bch_sb, flags[1], 28, 32);

LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);

LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
struct bch_sb, flags[2], 0, 4);

/* Features: */
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
@@ -1193,29 +1194,41 @@ struct jset_entry {
};
};

#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))

#define BCH_JSET_ENTRY_TYPES() \
x(btree_keys, 0) \
x(btree_root, 1) \
x(prio_ptrs, 2) \
x(blacklist, 3) \
x(blacklist_v2, 4)

enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
BCH_JSET_ENTRY_TYPES()
#undef x
BCH_JSET_ENTRY_NR
};

/*
 * Journal sequence numbers can be blacklisted: bsets record the max sequence
 * number of all the journal entries they contain updates for, so that on
 * recovery we can ignore those bsets that contain index updates newer that what
 * made it into the journal.
 *
 * This means that we can't reuse that journal_seq - we have to skip it, and
 * then record that we skipped it so that the next time we crash and recover we
 * don't think there was a missing journal entry.
 */
struct jset_entry_blacklist {
struct jset_entry entry;
__le64 seq;
};

#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))

enum {
JOURNAL_ENTRY_BTREE_KEYS = 0,
JOURNAL_ENTRY_BTREE_ROOT = 1,
JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */

/*
 * Journal sequence numbers can be blacklisted: bsets record the max
 * sequence number of all the journal entries they contain updates for,
 * so that on recovery we can ignore those bsets that contain index
 * updates newer that what made it into the journal.
 *
 * This means that we can't reuse that journal_seq - we have to skip it,
 * and then record that we skipped it so that the next time we crash and
 * recover we don't think there was a missing journal entry.
 */
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
struct jset_entry_blacklist_v2 {
struct jset_entry entry;
__le64 start;
__le64 end;
};

/*
@@ -13,7 +13,8 @@
#include "error.h"
#include "extents.h"
#include "io.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"

#include <trace/events/bcachefs.h>
@@ -947,6 +948,7 @@ enum btree_validate_ret {

#define btree_err(type, c, b, i, msg, ...) \
({ \
__label__ out; \
char _buf[300], *out = _buf, *end = out + sizeof(_buf); \
\
out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
@@ -956,7 +958,11 @@ enum btree_validate_ret {
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
mustfix_fsck_err(c, "%s", _buf); \
} else { \
goto out; \
} \
\
switch (write) { \
case READ: \
bch_err(c, "%s", _buf); \
\
switch (type) { \
@@ -976,7 +982,17 @@ enum btree_validate_ret {
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
break; \
case WRITE: \
bch_err(c, "corrupt metadata before write: %s", _buf); \
\
if (bch2_fs_inconsistent(c)) { \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
break; \
} \
out: \
true; \
})
@@ -1323,37 +1339,48 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_devs_mask avoid;
bool can_retry;

memset(&avoid, 0, sizeof(avoid));

goto start;
do {
while (1) {
bch_info(c, "retrying read");
ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio);
bio_set_dev(bio, rb->pick.ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
start:
bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);

__set_bit(rb->pick.ca->dev_idx, avoid.d);
rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
submit_bio_wait(bio);
} else {
bio->bi_status = BLK_STS_REMOVED;
}
start:
bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;

__set_bit(rb->pick.ptr.dev, avoid.d);
can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;

if (!bio->bi_status &&
!bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
goto out;
} while (!IS_ERR_OR_NULL(rb->pick.ca));
!bch2_btree_node_read_done(c, b, can_retry))
break;

set_btree_node_read_error(b);
out:
if (!IS_ERR_OR_NULL(rb->pick.ca))
percpu_ref_put(&rb->pick.ca->io_ref);
if (!can_retry) {
set_btree_node_read_error(b);
break;
}
}

bch2_time_stats_update(&c->btree_read_time, rb->start_time);
bio_put(&rb->bio);
@@ -1365,10 +1392,13 @@ static void btree_node_read_endio(struct bio *bio)
{
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
struct bch_fs *c = rb->c;

bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
if (rb->have_ioref) {
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}

INIT_WORK(&rb->work, btree_node_read_work);
queue_work(system_unbound_wq, &rb->work);
}
@@ -1377,41 +1407,58 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
{
struct extent_pick_ptr pick;
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
int ret;

trace_btree_read(c, b);

pick = bch2_btree_pick_ptr(c, b, NULL);
if (bch2_fs_fatal_err_on(!pick.ca, c,
ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
if (bch2_fs_fatal_err_on(ret <= 0, c,
"btree node read error: no device to read from")) {
set_btree_node_read_error(b);
return;
}

ca = bch_dev_bkey_exists(c, pick.ptr.dev);

bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->start_time = local_clock();
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
bio_set_dev(bio, pick.ca->disk_sb.bdev);
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = b;
bch2_bio_map(bio, b->data);

this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
bio_sectors(bio));

set_btree_node_read_in_flight(b);

if (sync) {
submit_bio_wait(bio);
bio->bi_private = b;
btree_node_read_work(&rb->work);
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
bio_sectors(bio));
bio_set_dev(bio, ca->disk_sb.bdev);

if (sync) {
submit_bio_wait(bio);

bio->bi_private = b;
btree_node_read_work(&rb->work);
} else {
submit_bio(bio);
}
} else {
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = b;
submit_bio(bio);
bio->bi_status = BLK_STS_REMOVED;

if (sync)
btree_node_read_work(&rb->work);
else
queue_work(system_unbound_wq, &rb->work);

}
}
@@ -1593,20 +1640,21 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
unsigned long flags;

bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);

if (bio->bi_status == BLK_STS_REMOVED ||
bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}

if (wbio->have_io_ref)
if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);

if (parent) {
@@ -12,8 +12,8 @@ struct btree_iter;

struct btree_read_bio {
struct bch_fs *c;
unsigned submit_time_us;
u64 start_time;
unsigned have_ioref:1;
struct extent_pick_ptr pick;
struct work_struct work;
struct bio bio;
@@ -748,7 +748,9 @@ static void btree_iter_prefetch(struct btree_iter *iter)
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
unsigned nr = iter->level > 1 ? 1 : 8;
unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags)
? (iter->level > 1 ? 0 : 2)
: (iter->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(iter, iter->level);

while (nr) {
@@ -12,6 +12,7 @@
#include "buckets.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "replicas.h"
#include "super-io.h"
@@ -8,6 +8,7 @@
#include "debug.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"

#include <linux/sort.h>
@@ -137,7 +138,7 @@ void bch2_btree_journal_key(struct btree_insert *trans,
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));

if (likely(trans->journal_res.ref)) {
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
@@ -155,12 +156,16 @@ void bch2_btree_journal_key(struct btree_insert *trans,
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
}

if (unlikely(!journal_pin_active(&w->journal)))
bch2_journal_pin_add(j, &trans->journal_res,
&w->journal,
if (unlikely(!journal_pin_active(&w->journal))) {
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq;

bch2_journal_pin_add(j, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
}

if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
@@ -142,7 +142,8 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;

if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow\n"))
"buckets_unavailable overflow (%llu > %llu)\n",
stats.buckets_unavailable, total))
return 0;

return total - stats.buckets_unavailable;
@@ -36,6 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
struct bset *sorted, *inmemory;
struct extent_pick_ptr pick;
struct bch_dev *ca;
struct bio *bio;

if (c->opts.nochanges)
@@ -54,12 +55,15 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
v->btree_id = b->btree_id;
bch2_btree_keys_init(v, &c->expensive_debug_checks);

pick = bch2_btree_pick_ptr(c, b, NULL);
if (IS_ERR_OR_NULL(pick.ca))
if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
return;

ca = bch_dev_bkey_exists(c, pick.ptr.dev);
if (!bch2_dev_get_ioref(ca, READ))
return;

bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
bio_set_dev(bio, pick.ca->disk_sb.bdev);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
@@ -68,7 +72,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
submit_bio_wait(bio);

bio_put(bio);
percpu_ref_put(&pick.ca->io_ref);
percpu_ref_put(&ca->io_ref);

memcpy(n_ondisk, n_sorted, btree_bytes(c));
@@ -3,20 +3,22 @@
#include "io.h"
#include "super.h"

void bch2_inconsistent_error(struct bch_fs *c)
bool bch2_inconsistent_error(struct bch_fs *c)
{
set_bit(BCH_FS_ERROR, &c->flags);

switch (c->opts.errors) {
case BCH_ON_ERROR_CONTINUE:
break;
return false;
case BCH_ON_ERROR_RO:
if (bch2_fs_emergency_read_only(c))
bch_err(c, "emergency read only");
break;
return true;
case BCH_ON_ERROR_PANIC:
panic(bch2_fmt(c, "panic after error"));
break;
return true;
default:
BUG();
}
}
@@ -45,13 +45,13 @@ do { \
 * BCH_ON_ERROR_CONTINUE mode
 */

void bch2_inconsistent_error(struct bch_fs *);
bool bch2_inconsistent_error(struct bch_fs *);

#define bch2_fs_inconsistent(c, ...) \
do { \
({ \
bch_err(c, __VA_ARGS__); \
bch2_inconsistent_error(c); \
} while (0)
})

#define bch2_fs_inconsistent_on(cond, c, ...) \
({ \
@ -588,58 +588,51 @@ out:
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
static inline bool dev_latency_better(struct bch_dev *dev1,
|
||||
struct bch_dev *dev2)
|
||||
static inline bool dev_latency_better(struct bch_fs *c,
|
||||
const struct bch_extent_ptr *ptr1,
|
||||
const struct bch_extent_ptr *ptr2)
|
||||
{
|
||||
unsigned l1 = atomic_read(&dev1->latency[READ]);
|
||||
unsigned l2 = atomic_read(&dev2->latency[READ]);
|
||||
struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev);
|
||||
struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev);
|
||||
u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
|
||||
u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
|
||||
|
||||
/* Pick at random, biased in favor of the faster device: */
|
||||
|
||||
return bch2_rand_range(l1 + l2) > l1;
|
||||
}
|
||||
|
||||
static void extent_pick_read_device(struct bch_fs *c,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_devs_mask *avoid,
|
||||
struct extent_pick_ptr *pick)
|
||||
static int extent_pick_read_device(struct bch_fs *c,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_devs_mask *avoid,
|
||||
struct extent_pick_ptr *pick)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
struct bch_dev *ca;
|
||||
int ret = 0;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
|
||||
if (ptr->cached && ptr_stale(ca, ptr))
|
||||
continue;
|
||||
|
||||
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
|
||||
if (avoid && test_bit(ptr->dev, avoid->d))
|
||||
continue;
|
||||
|
||||
if (avoid) {
|
||||
if (test_bit(ca->dev_idx, avoid->d))
|
||||
continue;
|
||||
|
||||
if (pick->ca &&
|
||||
test_bit(pick->ca->dev_idx, avoid->d))
|
||||
goto use;
|
||||
}
|
||||
|
||||
if (pick->ca && !dev_latency_better(ca, pick->ca))
|
||||
if (ret && !dev_latency_better(c, ptr, &pick->ptr))
|
||||
continue;
|
||||
use:
|
||||
if (!percpu_ref_tryget(&ca->io_ref))
|
||||
continue;
|
||||
|
||||
if (pick->ca)
|
||||
percpu_ref_put(&pick->ca->io_ref);
|
||||
|
||||
*pick = (struct extent_pick_ptr) {
|
||||
.ptr = *ptr,
|
||||
.crc = crc,
|
||||
.ca = ca,
|
||||
};
|
||||
|
||||
ret = 1;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Btree ptrs */
|
||||
@ -759,16 +752,12 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
|
||||
#undef p
|
||||
}
|
||||
|
||||
struct extent_pick_ptr
|
||||
bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
|
||||
struct bch_devs_mask *avoid)
|
||||
int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
|
||||
struct bch_devs_mask *avoid,
|
||||
struct extent_pick_ptr *pick)
|
||||
{
|
||||
struct extent_pick_ptr pick = { .ca = NULL };
|
||||
|
||||
extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
|
||||
avoid, &pick);
|
||||
|
||||
return pick;
|
||||
return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
|
||||
avoid, pick);
|
||||
}
|
||||
|
||||
/* Extents */
|
||||
@ -2057,37 +2046,33 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
|
||||
* Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
|
||||
* other devices, it will still pick a pointer from avoid.
|
||||
*/
|
||||
void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
|
||||
struct bch_devs_mask *avoid,
|
||||
struct extent_pick_ptr *ret)
|
||||
int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
|
||||
struct bch_devs_mask *avoid,
|
||||
struct extent_pick_ptr *pick)
|
||||
{
|
||||
struct bkey_s_c_extent e;
|
||||
int ret;
|
||||
|
||||
switch (k.k->type) {
|
||||
case KEY_TYPE_DELETED:
|
||||
case KEY_TYPE_DISCARD:
|
||||
case KEY_TYPE_COOKIE:
|
||||
ret->ca = NULL;
|
||||
return;
|
||||
return 0;
|
||||
|
||||
case KEY_TYPE_ERROR:
|
||||
ret->ca = ERR_PTR(-EIO);
|
||||
return;
|
||||
return -EIO;
|
||||
|
||||
case BCH_EXTENT:
|
||||
case BCH_EXTENT_CACHED:
|
||||
e = bkey_s_c_to_extent(k);
|
||||
ret->ca = NULL;
|
||||
ret = extent_pick_read_device(c, bkey_s_c_to_extent(k),
|
||||
avoid, pick);
|
||||
|
||||
extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
|
||||
if (!ret && !bkey_extent_is_cached(k.k))
|
||||
ret = -EIO;
|
||||
|
||||
if (!ret->ca && !bkey_extent_is_cached(e.k))
|
||||
ret->ca = ERR_PTR(-EIO);
|
||||
return;
|
||||
return ret;
|
||||
|
||||
case BCH_RESERVATION:
|
||||
ret->ca = NULL;
|
||||
return;
|
||||
return 0;
|
||||
|
||||
default:
|
||||
BUG();
|
||||
|
@ -53,13 +53,13 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
|
||||
struct btree *,
|
||||
struct btree_node_iter_large *);
|
||||
|
||||
struct extent_pick_ptr
|
||||
bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
|
||||
struct bch_devs_mask *avoid);
|
||||
int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
|
||||
struct bch_devs_mask *avoid,
|
||||
struct extent_pick_ptr *);
|
||||
|
||||
void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
|
||||
struct bch_devs_mask *,
|
||||
struct extent_pick_ptr *);
|
||||
int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
|
||||
struct bch_devs_mask *,
|
||||
struct extent_pick_ptr *);
|
||||
|
||||
enum btree_insert_ret
|
||||
bch2_insert_fixup_extent(struct btree_insert *,
|
||||
|
@@ -21,7 +21,6 @@ struct bch_extent_crc_unpacked {
struct extent_pick_ptr {
struct bch_extent_ptr ptr;
struct bch_extent_crc_unpacked crc;
struct bch_dev *ca;
};

#endif /* _BCACHEFS_EXTENTS_TYPES_H */
@ -20,6 +20,7 @@
|
||||
#include <linux/migrate.h>
|
||||
#include <linux/mmu_context.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/writeback.h>
|
||||
@ -124,13 +125,13 @@ static void bch2_quota_reservation_put(struct bch_fs *c,
|
||||
if (!res->sectors)
|
||||
return;
|
||||
|
||||
mutex_lock(&inode->ei_update_lock);
|
||||
mutex_lock(&inode->ei_quota_lock);
|
||||
BUG_ON(res->sectors > inode->ei_quota_reserved);
|
||||
|
||||
bch2_quota_acct(c, inode->ei_qid, Q_SPC,
|
||||
-((s64) res->sectors), BCH_QUOTA_PREALLOC);
|
||||
inode->ei_quota_reserved -= res->sectors;
|
||||
mutex_unlock(&inode->ei_update_lock);
|
||||
mutex_unlock(&inode->ei_quota_lock);
|
||||
|
||||
res->sectors = 0;
|
||||
}
|
||||
@ -143,14 +144,14 @@ static int bch2_quota_reservation_add(struct bch_fs *c,
|
||||
{
|
||||
int ret;
|
||||
|
||||
mutex_lock(&inode->ei_update_lock);
|
||||
mutex_lock(&inode->ei_quota_lock);
|
||||
ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
|
||||
check_enospc ? BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK);
|
||||
if (likely(!ret)) {
|
||||
inode->ei_quota_reserved += sectors;
|
||||
res->sectors += sectors;
|
||||
}
|
||||
mutex_unlock(&inode->ei_update_lock);
|
||||
mutex_unlock(&inode->ei_quota_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -195,9 +196,10 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
|
||||
return __bch2_write_inode(c, inode, inode_set_size, &new_size);
|
||||
}
|
||||
|
||||
static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
|
||||
struct quota_res *quota_res, int sectors)
|
||||
static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
|
||||
struct quota_res *quota_res, int sectors)
|
||||
{
|
||||
mutex_lock(&inode->ei_quota_lock);
|
||||
#ifdef CONFIG_BCACHEFS_QUOTA
|
||||
if (quota_res && sectors > 0) {
|
||||
BUG_ON(sectors > quota_res->sectors);
|
||||
@ -210,14 +212,7 @@ static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
|
||||
}
|
||||
#endif
|
||||
inode->v.i_blocks += sectors;
|
||||
}
|
||||
|
||||
static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
|
||||
struct quota_res *quota_res, int sectors)
|
||||
{
|
||||
mutex_lock(&inode->ei_update_lock);
|
||||
__i_sectors_acct(c, inode, quota_res, sectors);
|
||||
mutex_unlock(&inode->ei_update_lock);
|
||||
mutex_unlock(&inode->ei_quota_lock);
|
||||
}
|
||||
|
||||
/* i_sectors accounting: */
|
||||
@ -265,7 +260,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
|
||||
if (h->new_i_size != U64_MAX)
|
||||
i_size_write(&h->inode->v, h->new_i_size);
|
||||
|
||||
__i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
|
||||
i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
|
||||
|
||||
ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
|
||||
mutex_unlock(&h->inode->ei_update_lock);
|
||||
@ -773,6 +768,7 @@ void bch2_invalidatepage(struct page *page, unsigned int offset,
|
||||
|
||||
int bch2_releasepage(struct page *page, gfp_t gfp_mask)
|
||||
{
|
||||
/* XXX: this can't take locks that are held while we allocate memory */
|
||||
EBUG_ON(!PageLocked(page));
|
||||
EBUG_ON(PageWriteback(page));
|
||||
|
||||
@ -881,10 +877,12 @@ static int readpage_add_page(struct readpages_iter *iter, struct page *page)
|
||||
int ret;
|
||||
|
||||
prefetchw(&page->flags);
|
||||
page_state_init_for_read(page);
|
||||
|
||||
ret = add_to_page_cache_lru(page, iter->mapping,
|
||||
page->index, GFP_NOFS);
|
||||
if (!ret)
|
||||
page_state_init_for_read(page);
|
||||
|
||||
put_page(page);
|
||||
return ret;
|
||||
}
|
||||
@ -992,12 +990,13 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
|
||||
int flags = BCH_READ_RETRY_IF_STALE|
|
||||
BCH_READ_MAY_PROMOTE;
|
||||
|
||||
rbio->c = c;
|
||||
rbio->start_time = local_clock();
|
||||
|
||||
while (1) {
|
||||
struct extent_pick_ptr pick;
|
||||
BKEY_PADDED(k) tmp;
|
||||
struct bkey_s_c k;
|
||||
unsigned bytes;
|
||||
bool is_last;
|
||||
|
||||
bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
|
||||
|
||||
@ -1016,45 +1015,37 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
|
||||
bch2_btree_iter_unlock(iter);
|
||||
k = bkey_i_to_s_c(&tmp.k);
|
||||
|
||||
bch2_extent_pick_ptr(c, k, NULL, &pick);
|
||||
if (IS_ERR(pick.ca)) {
|
||||
bcache_io_error(c, bio, "no device to read from");
|
||||
bio_endio(bio);
|
||||
return;
|
||||
}
|
||||
if (readpages_iter) {
|
||||
bool want_full_extent = false;
|
||||
|
||||
if (bkey_extent_is_data(k.k)) {
|
||||
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
want_full_extent |= !!crc.csum_type |
|
||||
!!crc.compression_type;
|
||||
}
|
||||
|
||||
if (readpages_iter)
|
||||
readpage_bio_extend(readpages_iter,
|
||||
bio, k.k->p.offset,
|
||||
pick.ca &&
|
||||
(pick.crc.csum_type ||
|
||||
pick.crc.compression_type));
|
||||
want_full_extent);
|
||||
}
|
||||
|
||||
bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
|
||||
bio->bi_iter.bi_sector) << 9;
|
||||
is_last = bytes == bio->bi_iter.bi_size;
|
||||
swap(bio->bi_iter.bi_size, bytes);
|
||||
|
||||
if (bytes == bio->bi_iter.bi_size)
|
||||
flags |= BCH_READ_LAST_FRAGMENT;
|
||||
|
||||
if (bkey_extent_is_allocation(k.k))
|
||||
bch2_add_page_sectors(bio, k);
|
||||
|
||||
if (pick.ca) {
|
||||
if (!is_last) {
|
||||
bio_inc_remaining(&rbio->bio);
|
||||
flags |= BCH_READ_MUST_CLONE;
|
||||
trace_read_split(&rbio->bio);
|
||||
}
|
||||
bch2_read_extent(c, rbio, k, flags);
|
||||
|
||||
bch2_read_extent(c, rbio, bkey_s_c_to_extent(k),
|
||||
&pick, flags);
|
||||
} else {
|
||||
zero_fill_bio(bio);
|
||||
|
||||
if (is_last)
|
||||
bio_endio(bio);
|
||||
}
|
||||
|
||||
if (is_last)
|
||||
if (flags & BCH_READ_LAST_FRAGMENT)
|
||||
return;
|
||||
|
||||
swap(bio->bi_iter.bi_size, bytes);
|
||||
@ -1487,6 +1478,194 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
|
||||
return copied;
|
||||
}
|
||||
|
||||
#define WRITE_BATCH_PAGES 32
|
||||
|
||||
static int __bch2_buffered_write(struct bch_inode_info *inode,
|
||||
struct address_space *mapping,
|
||||
struct iov_iter *iter,
|
||||
loff_t pos, unsigned len)
|
||||
{
|
||||
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
||||
struct page *pages[WRITE_BATCH_PAGES];
|
||||
unsigned long index = pos >> PAGE_SHIFT;
|
||||
unsigned offset = pos & (PAGE_SIZE - 1);
|
||||
unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
|
||||
unsigned i, copied = 0, nr_pages_copied = 0;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(!len);
|
||||
BUG_ON(nr_pages > ARRAY_SIZE(pages));
|
||||
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
|
||||
if (!pages[i]) {
|
||||
nr_pages = i;
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (offset && !PageUptodate(pages[0])) {
|
||||
ret = bch2_read_single_page(pages[0], mapping);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((pos + len) & (PAGE_SIZE - 1) &&
|
||||
!PageUptodate(pages[nr_pages - 1])) {
|
||||
if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
|
||||
zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
|
||||
} else {
|
||||
ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
ret = bch2_get_page_reservation(c, inode, pages[i], true);
|
||||
|
||||
if (ret && !PageUptodate(pages[i])) {
|
||||
ret = bch2_read_single_page(pages[i], mapping);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = bch2_get_page_reservation(c, inode, pages[i], true);
|
||||
}
|
||||
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (mapping_writably_mapped(mapping))
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
flush_dcache_page(pages[i]);
|
||||
|
||||
while (copied < len) {
|
||||
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
|
||||
unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
|
||||
unsigned pg_bytes = min_t(unsigned, len - copied,
|
||||
PAGE_SIZE - pg_offset);
|
||||
unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
|
||||
iter, pg_offset, pg_bytes);
|
||||
|
||||
if (!pg_copied)
|
||||
break;
|
||||
|
||||
flush_dcache_page(page);
|
||||
iov_iter_advance(iter, pg_copied);
|
||||
copied += pg_copied;
|
||||
}
|
||||
|
||||
if (!copied)
|
||||
goto out;
|
||||
|
||||
nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
|
||||
inode->ei_last_dirtied = (unsigned long) current;
|
||||
|
||||
if (pos + copied > inode->v.i_size)
|
||||
i_size_write(&inode->v, pos + copied);
|
||||
|
||||
if (copied < len &&
|
||||
((offset + copied) & (PAGE_SIZE - 1))) {
|
||||
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
|
||||
|
||||
if (!PageUptodate(page)) {
|
||||
zero_user(page, 0, PAGE_SIZE);
|
||||
copied -= (offset + copied) & (PAGE_SIZE - 1);
|
||||
}
|
||||
}
|
||||
out:
|
||||
for (i = 0; i < nr_pages_copied; i++) {
|
||||
if (!PageUptodate(pages[i]))
|
||||
SetPageUptodate(pages[i]);
|
||||
if (!PageDirty(pages[i]))
|
||||
set_page_dirty(pages[i]);
|
||||
unlock_page(pages[i]);
|
||||
put_page(pages[i]);
|
||||
}
|
||||
|
||||
for (i = nr_pages_copied; i < nr_pages; i++) {
|
||||
if (!PageDirty(pages[i]))
|
||||
bch2_put_page_reservation(c, inode, pages[i]);
|
||||
unlock_page(pages[i]);
|
||||
put_page(pages[i]);
|
||||
}
|
||||
|
||||
return copied ?: ret;
|
||||
}
|
||||
|
||||
static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
loff_t pos = iocb->ki_pos;
|
||||
ssize_t written = 0;
|
||||
int ret = 0;
|
||||
|
||||
pagecache_add_get(&mapping->add_lock);
|
||||
|
||||
do {
|
||||
unsigned offset = pos & (PAGE_SIZE - 1);
|
||||
unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
|
||||
PAGE_SIZE * WRITE_BATCH_PAGES - offset);
|
||||
again:
|
||||
/*
|
||||
* Bring in the user page that we will copy from _first_.
|
||||
* Otherwise there's a nasty deadlock on copying from the
|
||||
* same page as we're writing to, without it being marked
|
||||
* up-to-date.
|
||||
*
|
||||
* Not only is this an optimisation, but it is also required
|
||||
* to check that the address is actually valid, when atomic
|
||||
* usercopies are used, below.
|
||||
*/
|
||||
if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
|
||||
bytes = min_t(unsigned long, iov_iter_count(iter),
|
||||
PAGE_SIZE - offset);
|
||||
|
||||
if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(fatal_signal_pending(current))) {
|
||||
ret = -EINTR;
|
||||
break;
|
||||
}
|
||||
|
||||
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
|
||||
if (unlikely(ret < 0))
|
||||
break;
|
||||
|
||||
cond_resched();
|
||||
|
||||
if (unlikely(ret == 0)) {
|
||||
/*
|
||||
* If we were unable to copy any data at all, we must
|
||||
* fall back to a single segment length write.
|
||||
*
|
||||
* If we didn't fallback here, we could livelock
|
||||
* because not all segments in the iov can be copied at
|
||||
* once without a pagefault.
|
||||
*/
|
||||
bytes = min_t(unsigned long, PAGE_SIZE - offset,
|
||||
iov_iter_single_seg_count(iter));
|
||||
goto again;
|
||||
}
|
||||
pos += ret;
|
||||
written += ret;
|
||||
|
||||
balance_dirty_pages_ratelimited(mapping);
|
||||
} while (iov_iter_count(iter));
|
||||
|
||||
pagecache_add_put(&mapping->add_lock);
|
||||
|
||||
return written ? written : ret;
|
||||
}
|
||||
|
||||
/* O_DIRECT reads */
|
||||
|
||||
static void bch2_dio_read_complete(struct closure *cl)
|
||||
@ -1822,7 +2001,7 @@ static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
|
||||
ret = iocb->ki_flags & IOCB_DIRECT
|
||||
? bch2_direct_write(iocb, from)
|
||||
: generic_perform_write(file, from, iocb->ki_pos);
|
||||
: bch2_buffered_write(iocb, from);
|
||||
|
||||
if (likely(ret > 0))
|
||||
iocb->ki_pos += ret;
|
||||
|
@@ -1028,6 +1028,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)

inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
mutex_init(&inode->ei_quota_lock);
inode->ei_journal_seq = 0;

return &inode->v;
@@ -15,6 +15,8 @@ struct bch_inode_info {
u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;

struct mutex ei_quota_lock;
struct bch_qid ei_qid;

struct bch_hash_info ei_str_hash;
libbcachefs/io.c: 924 changed lines (file diff suppressed because it is too large)
@ -16,7 +16,7 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
|
||||
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
|
||||
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
|
||||
|
||||
void bch2_latency_acct(struct bch_dev *, unsigned, int);
|
||||
void bch2_latency_acct(struct bch_dev *, u64, int);
|
||||
|
||||
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
|
||||
enum bch_data_type, const struct bkey_i *);
|
||||
@ -99,40 +99,28 @@ struct cache_promote_op;
|
||||
struct extent_pick_ptr;
|
||||
|
||||
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
|
||||
struct bkey_s_c_extent e, struct extent_pick_ptr *,
|
||||
unsigned);
|
||||
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
|
||||
u64, struct bch_devs_mask *, unsigned);
|
||||
struct bkey_s_c, struct bch_devs_mask *, unsigned);
|
||||
void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
|
||||
|
||||
enum bch_read_flags {
|
||||
BCH_READ_RETRY_IF_STALE = 1 << 0,
|
||||
BCH_READ_MAY_PROMOTE = 1 << 1,
|
||||
BCH_READ_USER_MAPPED = 1 << 2,
|
||||
BCH_READ_NODECODE = 1 << 3,
|
||||
BCH_READ_LAST_FRAGMENT = 1 << 4,
|
||||
|
||||
/* internal: */
|
||||
BCH_READ_MUST_BOUNCE = 1 << 4,
|
||||
BCH_READ_MUST_CLONE = 1 << 5,
|
||||
BCH_READ_IN_RETRY = 1 << 6,
|
||||
BCH_READ_MUST_BOUNCE = 1 << 5,
|
||||
BCH_READ_MUST_CLONE = 1 << 6,
|
||||
BCH_READ_IN_RETRY = 1 << 7,
|
||||
};
|
||||
|
||||
static inline void bch2_read_extent(struct bch_fs *c,
|
||||
struct bch_read_bio *rbio,
|
||||
struct bkey_s_c_extent e,
|
||||
struct extent_pick_ptr *pick,
|
||||
struct bkey_s_c k,
|
||||
unsigned flags)
|
||||
{
|
||||
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
|
||||
}
|
||||
|
||||
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
||||
u64 inode)
|
||||
{
|
||||
BUG_ON(rbio->_state);
|
||||
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
|
||||
BCH_READ_RETRY_IF_STALE|
|
||||
BCH_READ_MAY_PROMOTE|
|
||||
BCH_READ_USER_MAPPED);
|
||||
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
|
||||
}
|
||||
|
||||
static inline struct bch_read_bio *rbio_init(struct bio *bio,
|
||||
@ -146,4 +134,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
|
||||
return rbio;
|
||||
}
|
||||
|
||||
void bch2_fs_io_exit(struct bch_fs *);
|
||||
int bch2_fs_io_init(struct bch_fs *);
|
||||
|
||||
#endif /* _BCACHEFS_IO_H */
|
||||
|
@ -14,6 +14,8 @@
|
||||
|
||||
struct bch_read_bio {
|
||||
struct bch_fs *c;
|
||||
u64 start_time;
|
||||
u64 submit_time;
|
||||
|
||||
/*
|
||||
* Reads will often have to be split, and if the extent being read from
|
||||
@ -35,17 +37,19 @@ struct bch_read_bio {
|
||||
*/
|
||||
struct bvec_iter bvec_iter;
|
||||
|
||||
unsigned submit_time_us;
|
||||
u8 flags;
|
||||
u16 flags;
|
||||
union {
|
||||
struct {
|
||||
u8 bounce:1,
|
||||
u16 bounce:1,
|
||||
split:1,
|
||||
kmalloc:1,
|
||||
have_ioref:1,
|
||||
narrow_crcs:1,
|
||||
hole:1,
|
||||
retry:2,
|
||||
context:2;
|
||||
};
|
||||
u8 _state;
|
||||
u16 _state;
|
||||
};
|
||||
|
||||
struct bch_devs_list devs_have;
|
||||
@ -66,20 +70,20 @@ struct bch_read_bio {
|
||||
|
||||
struct bch_write_bio {
|
||||
struct bch_fs *c;
|
||||
struct bch_dev *ca;
|
||||
struct bch_write_bio *parent;
|
||||
|
||||
u64 submit_time;
|
||||
|
||||
struct bch_devs_list failed;
|
||||
u8 order;
|
||||
u8 dev;
|
||||
|
||||
unsigned split:1,
|
||||
bounce:1,
|
||||
put_bio:1,
|
||||
have_io_ref:1,
|
||||
have_ioref:1,
|
||||
used_mempool:1;
|
||||
|
||||
unsigned submit_time_us;
|
||||
|
||||
struct bio bio;
|
||||
};
|
||||
|
||||
@ -87,6 +91,7 @@ struct bch_write_op {
|
||||
struct closure cl;
|
||||
struct bch_fs *c;
|
||||
struct workqueue_struct *io_wq;
|
||||
u64 start_time;
|
||||
|
||||
unsigned written; /* sectors */
|
||||
u16 flags;
|
||||
|
File diff suppressed because it is too large
@ -112,72 +112,37 @@
|
||||
|
||||
#include "journal_types.h"
|
||||
|
||||
/*
|
||||
* Only used for holding the journal entries we read in btree_journal_read()
|
||||
* during cache_registration
|
||||
*/
|
||||
struct journal_replay {
|
||||
struct list_head list;
|
||||
struct bch_devs_list devs;
|
||||
/* must be last: */
|
||||
struct jset j;
|
||||
};
|
||||
|
||||
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
|
||||
struct jset_entry *entry, unsigned type)
|
||||
{
|
||||
while (entry < vstruct_last(jset)) {
|
||||
if (entry->type == type)
|
||||
return entry;
|
||||
|
||||
entry = vstruct_next(entry);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define for_each_jset_entry_type(entry, jset, type) \
|
||||
for (entry = (jset)->start; \
|
||||
(entry = __jset_entry_type_next(jset, entry, type)); \
|
||||
entry = vstruct_next(entry))
|
||||
|
||||
#define for_each_jset_key(k, _n, entry, jset) \
|
||||
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
|
||||
vstruct_for_each_safe(entry, k, _n)
|
||||
|
||||
#define JOURNAL_PIN (32 * 1024)
|
||||
|
||||
static inline bool journal_pin_active(struct journal_entry_pin *pin)
|
||||
{
|
||||
return pin->pin_list != NULL;
|
||||
}
|
||||
|
||||
static inline struct journal_entry_pin_list *
|
||||
journal_seq_pin(struct journal *j, u64 seq)
|
||||
{
|
||||
return &j->pin.data[seq & j->pin.mask];
|
||||
}
|
||||
|
||||
u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
|
||||
|
||||
void bch2_journal_pin_add(struct journal *, struct journal_res *,
|
||||
struct journal_entry_pin *, journal_pin_flush_fn);
|
||||
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
|
||||
void bch2_journal_pin_add_if_older(struct journal *,
|
||||
struct journal_entry_pin *,
|
||||
struct journal_entry_pin *,
|
||||
journal_pin_flush_fn);
|
||||
int bch2_journal_flush_pins(struct journal *, u64);
|
||||
int bch2_journal_flush_all_pins(struct journal *);
|
||||
|
||||
struct closure;
|
||||
struct bch_fs;
|
||||
struct keylist;
|
||||
|
||||
struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
|
||||
enum btree_id, unsigned *);
|
||||
static inline void journal_wake(struct journal *j)
|
||||
{
|
||||
wake_up(&j->wait);
|
||||
closure_wake_up(&j->async_wait);
|
||||
}
|
||||
|
||||
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
|
||||
static inline struct journal_buf *journal_cur_buf(struct journal *j)
|
||||
{
|
||||
return j->buf + j->reservations.idx;
|
||||
}
|
||||
|
||||
static inline struct journal_buf *journal_prev_buf(struct journal *j)
|
||||
{
|
||||
return j->buf + !j->reservations.idx;
|
||||
}
|
||||
|
||||
/* Sequence number of oldest dirty journal entry */
|
||||
|
||||
static inline u64 journal_last_seq(struct journal *j)
|
||||
{
|
||||
return j->pin.front;
|
||||
}
|
||||
|
||||
static inline u64 journal_cur_seq(struct journal *j)
|
||||
{
|
||||
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
|
||||
|
||||
return j->pin.back - 1;
|
||||
}
|
||||
|
||||
u64 bch2_inode_journal_seq(struct journal *, u64);
|
||||
|
||||
@ -213,21 +178,18 @@ static inline unsigned jset_u64s(unsigned u64s)
|
||||
return u64s + sizeof(struct jset_entry) / sizeof(u64);
|
||||
}
|
||||
|
||||
static inline void bch2_journal_add_entry_at(struct journal_buf *buf,
|
||||
unsigned offset,
|
||||
unsigned type, enum btree_id id,
|
||||
unsigned level,
|
||||
const void *data, size_t u64s)
|
||||
static inline struct jset_entry *
|
||||
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
|
||||
{
|
||||
struct jset_entry *entry = vstruct_idx(buf->data, offset);
|
||||
struct jset *jset = buf->data;
|
||||
struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
|
||||
|
||||
memset(entry, 0, sizeof(*entry));
|
||||
entry->u64s = cpu_to_le16(u64s);
|
||||
entry->btree_id = id;
|
||||
entry->level = level;
|
||||
entry->type = type;
|
||||
entry->u64s = cpu_to_le16(u64s);
|
||||
|
||||
memcpy_u64s(entry->_data, data, u64s);
|
||||
le32_add_cpu(&jset->u64s, jset_u64s(u64s));
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
|
||||
@ -236,21 +198,27 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res
|
||||
const void *data, unsigned u64s)
|
||||
{
|
||||
struct journal_buf *buf = &j->buf[res->idx];
|
||||
struct jset_entry *entry = vstruct_idx(buf->data, res->offset);
|
||||
unsigned actual = jset_u64s(u64s);
|
||||
|
||||
EBUG_ON(!res->ref);
|
||||
EBUG_ON(actual > res->u64s);
|
||||
|
||||
bch2_journal_add_entry_at(buf, res->offset, type,
|
||||
id, level, data, u64s);
|
||||
res->offset += actual;
|
||||
res->u64s -= actual;
|
||||
|
||||
memset(entry, 0, sizeof(*entry));
|
||||
entry->u64s = cpu_to_le16(u64s);
|
||||
entry->type = type;
|
||||
entry->btree_id = id;
|
||||
entry->level = level;
|
||||
memcpy_u64s(entry->_data, data, u64s);
|
||||
}
|
||||
|
||||
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
|
||||
enum btree_id id, const struct bkey_i *k)
|
||||
{
|
||||
bch2_journal_add_entry(j, res, JOURNAL_ENTRY_BTREE_KEYS,
|
||||
bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
|
||||
id, 0, k, k->k.u64s);
|
||||
}
|
||||
|
||||
@ -292,7 +260,7 @@ static inline void bch2_journal_res_put(struct journal *j,
|
||||
|
||||
while (res->u64s)
|
||||
bch2_journal_add_entry(j, res,
|
||||
JOURNAL_ENTRY_BTREE_KEYS,
|
||||
BCH_JSET_ENTRY_btree_keys,
|
||||
0, 0, NULL, 0);
|
||||
|
||||
bch2_journal_buf_put(j, res->idx, false);
|
||||
@ -368,7 +336,6 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
|
||||
int bch2_journal_flush_seq(struct journal *, u64);
|
||||
int bch2_journal_flush(struct journal *);
|
||||
int bch2_journal_meta(struct journal *);
|
||||
int bch2_journal_flush_device(struct journal *, int);
|
||||
|
||||
void bch2_journal_halt(struct journal *);
|
||||
|
||||
@ -385,10 +352,8 @@ static inline bool journal_flushes_device(struct bch_dev *ca)
|
||||
return true;
|
||||
}
|
||||
|
||||
void bch2_journal_start(struct bch_fs *);
|
||||
int bch2_journal_mark(struct bch_fs *, struct list_head *);
|
||||
void bch2_journal_entries_free(struct list_head *);
|
||||
int bch2_journal_read(struct bch_fs *, struct list_head *);
|
||||
int bch2_journal_replay(struct bch_fs *, struct list_head *);
|
||||
|
||||
static inline void bch2_journal_set_replay_done(struct journal *j)
|
||||
@ -404,6 +369,7 @@ int bch2_dev_journal_alloc(struct bch_dev *);
|
||||
|
||||
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
|
||||
void bch2_fs_journal_stop(struct journal *);
|
||||
void bch2_fs_journal_start(struct journal *);
|
||||
void bch2_dev_journal_exit(struct bch_dev *);
|
||||
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
|
||||
void bch2_fs_journal_exit(struct journal *);
|
||||
|
libbcachefs/journal_io.c: 1423 lines, new file (file diff suppressed because it is too large)
libbcachefs/journal_io.h: 45 lines, new file
@@ -0,0 +1,45 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H

struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
enum btree_id, unsigned *);

/*
 * Only used for holding the journal entries we read in btree_journal_read()
 * during cache_registration
 */
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
/* must be last: */
struct jset j;
};

static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
while (entry < vstruct_last(jset)) {
if (entry->type == type)
return entry;

entry = vstruct_next(entry);
}

return NULL;
}

#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))

#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)

int bch2_journal_read(struct bch_fs *, struct list_head *);

int bch2_journal_entry_sectors(struct journal *);
void bch2_journal_write(struct closure *);

#endif /* _BCACHEFS_JOURNAL_IO_H */
libbcachefs/journal_reclaim.c: 411 lines, new file
@ -0,0 +1,411 @@
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "journal.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "replicas.h"
|
||||
#include "super.h"
|
||||
|
||||
/*
|
||||
* Journal entry pinning - machinery for holding a reference on a given journal
|
||||
* entry, holding it open to ensure it gets replayed during recovery:
|
||||
*/
|
||||
|
||||
static inline u64 journal_pin_seq(struct journal *j,
|
||||
struct journal_entry_pin_list *pin_list)
|
||||
{
|
||||
return fifo_entry_idx_abs(&j->pin, pin_list);
|
||||
}
|
||||
|
||||
u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
|
||||
{
|
||||
u64 ret = 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
if (journal_pin_active(pin))
|
||||
ret = journal_pin_seq(j, pin->pin_list);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void __journal_pin_add(struct journal *j,
|
||||
struct journal_entry_pin_list *pin_list,
|
||||
struct journal_entry_pin *pin,
|
||||
journal_pin_flush_fn flush_fn)
|
||||
{
|
||||
BUG_ON(journal_pin_active(pin));
|
||||
BUG_ON(!atomic_read(&pin_list->count));
|
||||
|
||||
atomic_inc(&pin_list->count);
|
||||
pin->pin_list = pin_list;
|
||||
pin->flush = flush_fn;
|
||||
|
||||
if (flush_fn)
|
||||
list_add(&pin->list, &pin_list->list);
|
||||
else
|
||||
INIT_LIST_HEAD(&pin->list);
|
||||
|
||||
/*
|
||||
* If the journal is currently full, we might want to call flush_fn
|
||||
* immediately:
|
||||
*/
|
||||
journal_wake(j);
|
||||
}
|
||||
|
||||
void bch2_journal_pin_add(struct journal *j, u64 seq,
|
||||
struct journal_entry_pin *pin,
|
||||
journal_pin_flush_fn flush_fn)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
__journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
static inline void __journal_pin_drop(struct journal *j,
|
||||
struct journal_entry_pin *pin)
|
||||
{
|
||||
struct journal_entry_pin_list *pin_list = pin->pin_list;
|
||||
|
||||
if (!journal_pin_active(pin))
|
||||
return;
|
||||
|
||||
pin->pin_list = NULL;
|
||||
list_del_init(&pin->list);
|
||||
|
||||
/*
|
||||
* Unpinning a journal entry make make journal_next_bucket() succeed, if
|
||||
* writing a new last_seq will now make another bucket available:
|
||||
*/
|
||||
if (atomic_dec_and_test(&pin_list->count) &&
|
||||
pin_list == &fifo_peek_front(&j->pin))
|
||||
bch2_journal_reclaim_fast(j);
|
||||
}
|
||||
|
||||
void bch2_journal_pin_drop(struct journal *j,
|
||||
struct journal_entry_pin *pin)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
__journal_pin_drop(j, pin);
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
void bch2_journal_pin_add_if_older(struct journal *j,
|
||||
struct journal_entry_pin *src_pin,
|
||||
struct journal_entry_pin *pin,
|
||||
journal_pin_flush_fn flush_fn)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
|
||||
if (journal_pin_active(src_pin) &&
|
||||
(!journal_pin_active(pin) ||
|
||||
journal_pin_seq(j, src_pin->pin_list) <
|
||||
journal_pin_seq(j, pin->pin_list))) {
|
||||
__journal_pin_drop(j, pin);
|
||||
__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
|
||||
}
|
||||
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Journal reclaim: flush references to open journal entries to reclaim space in
|
||||
* the journal
|
||||
*
|
||||
* May be done by the journal code in the background as needed to free up space
|
||||
* for more journal entries, or as part of doing a clean shutdown, or to migrate
|
||||
* data off of a specific device:
|
||||
*/
|
||||
|
||||
/**
|
||||
* bch2_journal_reclaim_fast - do the fast part of journal reclaim
|
||||
*
|
||||
* Called from IO submission context, does not block. Cleans up after btree
|
||||
* write completions by advancing the journal pin and each cache's last_idx,
|
||||
* kicking off discards and background reclaim as necessary.
|
||||
*/
|
||||
void bch2_journal_reclaim_fast(struct journal *j)
|
||||
{
|
||||
struct journal_entry_pin_list temp;
|
||||
bool popped = false;
|
||||
|
||||
lockdep_assert_held(&j->lock);
|
||||
|
||||
/*
|
||||
* Unpin journal entries whose reference counts reached zero, meaning
|
||||
* all btree nodes got written out
|
||||
*/
|
||||
while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
|
||||
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
|
||||
BUG_ON(!fifo_pop(&j->pin, temp));
|
||||
popped = true;
|
||||
}
|
||||
|
||||
if (popped)
|
||||
journal_wake(j);
|
||||
}
|
||||
|
||||
static struct journal_entry_pin *
|
||||
__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
|
||||
{
|
||||
struct journal_entry_pin_list *pin_list;
|
||||
struct journal_entry_pin *ret;
|
||||
u64 iter;
|
||||
|
||||
/* no need to iterate over empty fifo entries: */
|
||||
bch2_journal_reclaim_fast(j);
|
||||
|
||||
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
|
||||
if (iter > seq_to_flush)
|
||||
break;
|
||||
|
||||
ret = list_first_entry_or_null(&pin_list->list,
|
||||
struct journal_entry_pin, list);
|
||||
if (ret) {
|
||||
/* must be list_del_init(), see bch2_journal_pin_drop() */
|
||||
list_move(&ret->list, &pin_list->flushed);
|
||||
*seq = iter;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct journal_entry_pin *
|
||||
journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
|
||||
{
|
||||
struct journal_entry_pin *ret;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
ret = __journal_get_next_pin(j, seq_to_flush, seq);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
ret = ja->nr &&
|
||||
(ja->last_idx != ja->cur_idx &&
|
||||
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* bch2_journal_reclaim_work - free up journal buckets
|
||||
*
|
||||
* Background journal reclaim writes out btree nodes. It should be run
|
||||
* early enough so that we never completely run out of journal buckets.
|
||||
*
|
||||
* High watermarks for triggering background reclaim:
|
||||
* - FIFO has fewer than 512 entries left
|
||||
* - fewer than 25% journal buckets free
|
||||
*
|
||||
* Background reclaim runs until low watermarks are reached:
|
||||
* - FIFO has more than 1024 entries left
|
||||
* - more than 50% journal buckets free
|
||||
*
|
||||
* As long as a reclaim can complete in the time it takes to fill up
|
||||
* 512 journal entries or 25% of all journal buckets, then
|
||||
* journal_next_bucket() should not stall.
|
||||
*/
|
||||
void bch2_journal_reclaim_work(struct work_struct *work)
|
||||
{
|
||||
struct bch_fs *c = container_of(to_delayed_work(work),
|
||||
struct bch_fs, journal.reclaim_work);
|
||||
struct journal *j = &c->journal;
|
||||
struct bch_dev *ca;
|
||||
struct journal_entry_pin *pin;
|
||||
u64 seq, seq_to_flush = 0;
|
||||
unsigned iter, bucket_to_flush;
|
||||
unsigned long next_flush;
|
||||
bool reclaim_lock_held = false, need_flush;
|
||||
|
||||
/*
|
||||
* Advance last_idx to point to the oldest journal entry containing
|
||||
* btree node updates that have not yet been written out
|
||||
*/
|
||||
for_each_rw_member(ca, c, iter) {
|
||||
struct journal_device *ja = &ca->journal;
|
||||
|
||||
if (!ja->nr)
|
||||
continue;
|
||||
|
||||
while (should_discard_bucket(j, ja)) {
|
||||
if (!reclaim_lock_held) {
|
||||
/*
|
||||
* ugh:
|
||||
* might be called from __journal_res_get()
|
||||
* under wait_event() - have to go back to
|
||||
* TASK_RUNNING before doing something that
|
||||
* would block, but only if we're doing work:
|
||||
*/
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
mutex_lock(&j->reclaim_lock);
|
||||
reclaim_lock_held = true;
|
||||
/* recheck under reclaim_lock: */
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ca->mi.discard &&
|
||||
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
|
||||
blkdev_issue_discard(ca->disk_sb.bdev,
|
||||
bucket_to_sector(ca,
|
||||
ja->buckets[ja->last_idx]),
|
||||
ca->mi.bucket_size, GFP_NOIO, 0);
|
||||
|
||||
spin_lock(&j->lock);
|
||||
ja->last_idx = (ja->last_idx + 1) % ja->nr;
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
journal_wake(j);
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out enough btree nodes to free up 50% journal
|
||||
* buckets
|
||||
*/
|
||||
spin_lock(&j->lock);
|
||||
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
|
||||
seq_to_flush = max_t(u64, seq_to_flush,
|
||||
ja->bucket_seq[bucket_to_flush]);
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
if (reclaim_lock_held)
|
||||
mutex_unlock(&j->reclaim_lock);
|
||||
|
||||
/* Also flush if the pin fifo is more than half full */
|
||||
spin_lock(&j->lock);
|
||||
seq_to_flush = max_t(s64, seq_to_flush,
|
||||
(s64) journal_cur_seq(j) -
|
||||
(j->pin.size >> 1));
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
/*
|
||||
* If it's been longer than j->reclaim_delay_ms since we last flushed,
|
||||
* make sure to flush at least one journal pin:
|
||||
*/
|
||||
next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
|
||||
need_flush = time_after(jiffies, next_flush);
|
||||
|
||||
while ((pin = journal_get_next_pin(j, need_flush
|
||||
? U64_MAX
|
||||
: seq_to_flush, &seq))) {
|
||||
__set_current_state(TASK_RUNNING);
|
||||
pin->flush(j, pin, seq);
|
||||
need_flush = false;
|
||||
|
||||
j->last_flushed = jiffies;
|
||||
}
|
||||
|
||||
if (!test_bit(BCH_FS_RO, &c->flags))
|
||||
queue_delayed_work(system_freezable_wq, &j->reclaim_work,
|
||||
msecs_to_jiffies(j->reclaim_delay_ms));
|
||||
}
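
Not part of the diff: a standalone sketch of the "flush enough to free ~50% of journal buckets" target computed in the loop above. The bucket_seq values and ring size are made up for illustration; in the real code the result is additionally raised to cover the case of the pin FIFO being more than half full.

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of the reclaim target from bch2_journal_reclaim_work():
 * pick the bucket halfway around the ring from cur_idx, and flush all
 * journal entries up to the sequence number recorded in that bucket,
 * which frees roughly half of this device's journal buckets.
 */
int main(void)
{
	uint64_t bucket_seq[8] = { 100, 108, 116, 124, 132, 140, 148, 156 };
	unsigned nr = 8, cur_idx = 6;

	unsigned bucket_to_flush = (cur_idx + (nr >> 1)) % nr;	/* (6 + 4) % 8 = 2 */
	uint64_t seq_to_flush = bucket_seq[bucket_to_flush];	/* 116 */

	printf("flush btree nodes pinning journal seq <= %llu (bucket %u)\n",
	       (unsigned long long) seq_to_flush, bucket_to_flush);
	return 0;
}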
|
||||
|
||||
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
|
||||
struct journal_entry_pin **pin,
|
||||
u64 *pin_seq)
|
||||
{
|
||||
int ret;
|
||||
|
||||
*pin = NULL;
|
||||
|
||||
ret = bch2_journal_error(j);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
/*
|
||||
* If journal replay hasn't completed, the unreplayed journal entries
|
||||
* hold refs on their corresponding sequence numbers
|
||||
*/
|
||||
ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
|
||||
!test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
|
||||
journal_last_seq(j) > seq_to_flush ||
|
||||
(fifo_used(&j->pin) == 1 &&
|
||||
atomic_read(&fifo_peek_front(&j->pin).count) == 1);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
|
||||
{
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct journal_entry_pin *pin;
|
||||
u64 pin_seq;
|
||||
bool flush;
|
||||
|
||||
if (!test_bit(JOURNAL_STARTED, &j->flags))
|
||||
return 0;
|
||||
again:
|
||||
wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
|
||||
if (pin) {
|
||||
/* flushing a journal pin might cause a new one to be added: */
|
||||
pin->flush(j, pin, pin_seq);
|
||||
goto again;
|
||||
}
|
||||
|
||||
spin_lock(&j->lock);
|
||||
flush = journal_last_seq(j) != j->last_seq_ondisk ||
|
||||
(seq_to_flush == U64_MAX && c->btree_roots_dirty);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return flush ? bch2_journal_meta(j) : 0;
|
||||
}
|
||||
|
||||
int bch2_journal_flush_all_pins(struct journal *j)
|
||||
{
|
||||
return bch2_journal_flush_pins(j, U64_MAX);
|
||||
}
|
||||
|
||||
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
|
||||
{
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct journal_entry_pin_list *p;
|
||||
struct bch_devs_list devs;
|
||||
u64 iter, seq = 0;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
fifo_for_each_entry_ptr(p, &j->pin, iter)
|
||||
if (dev_idx >= 0
|
||||
? bch2_dev_list_has_dev(p->devs, dev_idx)
|
||||
: p->devs.nr < c->opts.metadata_replicas)
|
||||
seq = iter;
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
ret = bch2_journal_flush_pins(j, seq);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
|
||||
|
||||
seq = 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
while (!ret && seq < j->pin.back) {
|
||||
seq = max(seq, journal_last_seq(j));
|
||||
devs = journal_seq_pin(j, seq)->devs;
|
||||
seq++;
|
||||
|
||||
spin_unlock(&j->lock);
|
||||
ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
|
||||
spin_lock(&j->lock);
|
||||
}
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
libbcachefs/journal_reclaim.h (new file, 36 lines)
@ -0,0 +1,36 @@
#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
#define _BCACHEFS_JOURNAL_RECLAIM_H

#define JOURNAL_PIN	(32 * 1024)

static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
	return pin->pin_list != NULL;
}

static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
	BUG_ON(seq < j->pin.front || seq >= j->pin.back);

	return &j->pin.data[seq & j->pin.mask];
}

u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);

void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
			  journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
				   struct journal_entry_pin *,
				   struct journal_entry_pin *,
				   journal_pin_flush_fn);

void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);

int bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
int bch2_journal_flush_device_pins(struct journal *, int);

#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
libbcachefs/journal_seq_blacklist.c (new file, 358 lines)
@ -0,0 +1,358 @@
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "error.h"
|
||||
#include "journal.h"
|
||||
#include "journal_io.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "journal_seq_blacklist.h"
|
||||
|
||||
/*
|
||||
* journal_seq_blacklist machinery:
|
||||
*
|
||||
* To guarantee order of btree updates after a crash, we need to detect when a
|
||||
* btree node entry (bset) is newer than the newest journal entry that was
|
||||
* successfully written, and ignore it - effectively ignoring any btree updates
|
||||
* that didn't make it into the journal.
|
||||
*
|
||||
* If we didn't do this, we might have two btree nodes, a and b, both with
|
||||
* updates that weren't written to the journal yet: if b was updated after a,
|
||||
* but b was flushed and not a - oops; on recovery we'll find that the updates
|
||||
* to b happened, but not the updates to a that happened before it.
|
||||
*
|
||||
* Ignoring bsets that are newer than the newest journal entry is always safe,
|
||||
* because everything they contain will also have been journalled - and must
|
||||
* still be present in the journal on disk until a journal entry has been
|
||||
* written _after_ that bset was written.
|
||||
*
|
||||
* To accomplish this, bsets record the newest journal sequence number they
|
||||
* contain updates for; then, on startup, the btree code queries the journal
|
||||
* code to ask "Is this sequence number newer than the newest journal entry? If
|
||||
* so, ignore it."
|
||||
*
|
||||
* When this happens, we must blacklist that journal sequence number: the
|
||||
* journal must not write any entries with that sequence number, and it must
|
||||
* record that it was blacklisted so that a) on recovery we don't think we have
|
||||
* missing journal entries and b) so that the btree code continues to ignore
|
||||
* that bset, until that btree node is rewritten.
|
||||
*
|
||||
* Blacklisted journal sequence numbers are themselves recorded as entries in
|
||||
* the journal.
|
||||
*/
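
Not part of the diff: a minimal worked example of the scenario the comment above describes. All numbers are made up; the real decision and blacklisting logic is bch2_journal_seq_should_ignore() below.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/*
	 * The newest journal entry that made it to disk has seq 10, but
	 * node b was flushed carrying updates journalled at seq 12 (node
	 * a, updated earlier, never made it). Replaying b without a would
	 * reorder updates, so b's bset must be ignored and its sequence
	 * number blacklisted.
	 */
	uint64_t newest_written_seq = 10;
	uint64_t bset_seq_node_b = 12;

	if (bset_seq_node_b > newest_written_seq)
		printf("ignore node b's bset; blacklist journal seq %llu\n",
		       (unsigned long long) bset_seq_node_b);
	return 0;
}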
|
||||
|
||||
/*
|
||||
* Called when journal needs to evict a blacklist entry to reclaim space: find
|
||||
* any btree nodes that refer to the blacklist journal sequence numbers, and
|
||||
* rewrite them:
|
||||
*/
|
||||
static void journal_seq_blacklist_flush(struct journal *j,
|
||||
struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
struct bch_fs *c =
|
||||
container_of(j, struct bch_fs, journal);
|
||||
struct journal_seq_blacklist *bl =
|
||||
container_of(pin, struct journal_seq_blacklist, pin);
|
||||
struct blacklisted_node n;
|
||||
struct closure cl;
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
for (i = 0;; i++) {
|
||||
struct btree_iter iter;
|
||||
struct btree *b;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
if (i >= bl->nr_entries) {
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
break;
|
||||
}
|
||||
n = bl->entries[i];
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
|
||||
__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
|
||||
|
||||
b = bch2_btree_iter_peek_node(&iter);
|
||||
|
||||
/* The node might have already been rewritten: */
|
||||
|
||||
if (b->data->keys.seq == n.seq) {
|
||||
ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
|
||||
if (ret) {
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
bch2_fs_fatal_error(c,
|
||||
"error %i rewriting btree node with blacklisted journal seq",
|
||||
ret);
|
||||
bch2_journal_halt(j);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
}
|
||||
|
||||
for (i = 0;; i++) {
|
||||
struct btree_update *as;
|
||||
struct pending_btree_node_free *d;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
if (i >= bl->nr_entries) {
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
break;
|
||||
}
|
||||
n = bl->entries[i];
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
redo_wait:
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
* Is the node on the list of pending interior node updates -
|
||||
* being freed? If so, wait for that to finish:
|
||||
*/
|
||||
for_each_pending_btree_node_free(c, as, d)
|
||||
if (n.seq == d->seq &&
|
||||
n.btree_id == d->btree_id &&
|
||||
!d->level &&
|
||||
!bkey_cmp(n.pos, d->key.k.p)) {
|
||||
closure_wait(&as->wait, &cl);
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
closure_sync(&cl);
|
||||
goto redo_wait;
|
||||
}
|
||||
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
}
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
|
||||
bch2_journal_pin_drop(j, &bl->pin);
|
||||
list_del(&bl->list);
|
||||
kfree(bl->entries);
|
||||
kfree(bl);
|
||||
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine if a particular sequence number is blacklisted - if so, return
|
||||
* blacklist entry:
|
||||
*/
|
||||
struct journal_seq_blacklist *
|
||||
bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
|
||||
{
|
||||
struct journal_seq_blacklist *bl;
|
||||
|
||||
lockdep_assert_held(&j->blacklist_lock);
|
||||
|
||||
list_for_each_entry(bl, &j->seq_blacklist, list)
|
||||
if (seq >= bl->start && seq <= bl->end)
|
||||
return bl;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a new, in memory blacklist entry:
|
||||
*/
|
||||
static struct journal_seq_blacklist *
|
||||
bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
|
||||
{
|
||||
struct journal_seq_blacklist *bl;
|
||||
|
||||
lockdep_assert_held(&j->blacklist_lock);
|
||||
|
||||
/*
|
||||
* When we start the journal, bch2_journal_start() will skip over @seq:
|
||||
*/
|
||||
|
||||
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
|
||||
if (!bl)
|
||||
return NULL;
|
||||
|
||||
bl->start = start;
|
||||
bl->end = end;
|
||||
|
||||
list_add_tail(&bl->list, &j->seq_blacklist);
|
||||
return bl;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if @seq is newer than the most recent journal entry that got
|
||||
* written, and data corresponding to @seq should be ignored - also marks @seq
|
||||
* as blacklisted so that on future restarts the corresponding data will still
|
||||
* be ignored:
|
||||
*/
|
||||
int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
|
||||
{
|
||||
struct journal *j = &c->journal;
|
||||
struct journal_seq_blacklist *bl = NULL;
|
||||
struct blacklisted_node *n;
|
||||
u64 journal_seq;
|
||||
int ret = 0;
|
||||
|
||||
if (!seq)
|
||||
return 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
journal_seq = journal_cur_seq(j);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
/* Interior updates aren't journalled: */
|
||||
BUG_ON(b->level);
|
||||
BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
|
||||
|
||||
/*
|
||||
* Decrease this back to j->seq + 2 when we next rev the on disk format:
|
||||
* increasing it temporarily to work around bug in old kernels
|
||||
*/
|
||||
fsck_err_on(seq > journal_seq + 4, c,
|
||||
"bset journal seq too far in the future: %llu > %llu",
|
||||
seq, journal_seq);
|
||||
|
||||
if (seq <= journal_seq &&
|
||||
list_empty_careful(&j->seq_blacklist))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
|
||||
if (seq <= journal_seq) {
|
||||
bl = bch2_journal_seq_blacklist_find(j, seq);
|
||||
if (!bl)
|
||||
goto out;
|
||||
} else {
|
||||
bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
|
||||
b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
|
||||
|
||||
if (!j->new_blacklist) {
|
||||
j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
|
||||
journal_seq + 1,
|
||||
journal_seq + 1);
|
||||
if (!j->new_blacklist) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
bl = j->new_blacklist;
|
||||
bl->end = max(bl->end, seq);
|
||||
}
|
||||
|
||||
for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
|
||||
if (b->data->keys.seq == n->seq &&
|
||||
b->btree_id == n->btree_id &&
|
||||
!bkey_cmp(b->key.k.p, n->pos))
|
||||
goto found_entry;
|
||||
|
||||
if (!bl->nr_entries ||
|
||||
is_power_of_2(bl->nr_entries)) {
|
||||
n = krealloc(bl->entries,
|
||||
max(bl->nr_entries * 2, 8UL) * sizeof(*n),
|
||||
GFP_KERNEL);
|
||||
if (!n) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
bl->entries = n;
|
||||
}
|
||||
|
||||
bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
|
||||
.seq = b->data->keys.seq,
|
||||
.btree_id = b->btree_id,
|
||||
.pos = b->key.k.p,
|
||||
};
|
||||
found_entry:
|
||||
ret = 1;
|
||||
out:
|
||||
fsck_err:
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
return ret;
|
||||
}
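
Not part of the diff: a hypothetical sketch of the caller shape in the btree node read path, illustrating the return convention (< 0 error, 0 keep, 1 ignore). The bset's journal_seq field and the "empty the bset" handling are assumptions for illustration, not the actual btree_io code.

static int maybe_ignore_bset(struct bch_fs *c, struct btree *b, struct bset *i)
{
	int ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);

	if (ret < 0)
		return ret;		/* e.g. -ENOMEM */
	if (ret)			/* blacklisted: drop the bset's keys */
		i->u64s = 0;
	return 0;
}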
|
||||
|
||||
static int __bch2_journal_seq_blacklist_read(struct journal *j,
|
||||
struct journal_replay *i,
|
||||
u64 start, u64 end)
|
||||
{
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct journal_seq_blacklist *bl;
|
||||
|
||||
bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
|
||||
start, end);
|
||||
|
||||
bl = bch2_journal_seq_blacklisted_new(j, start, end);
|
||||
if (!bl)
|
||||
return -ENOMEM;
|
||||
|
||||
bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
|
||||
journal_seq_blacklist_flush);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* After reading the journal, find existing journal seq blacklist entries and
|
||||
* read them into memory:
|
||||
*/
|
||||
int bch2_journal_seq_blacklist_read(struct journal *j,
|
||||
struct journal_replay *i)
|
||||
{
|
||||
struct jset_entry *entry;
|
||||
int ret = 0;
|
||||
|
||||
vstruct_for_each(&i->j, entry) {
|
||||
switch (entry->type) {
|
||||
case BCH_JSET_ENTRY_blacklist: {
|
||||
struct jset_entry_blacklist *bl_entry =
|
||||
container_of(entry, struct jset_entry_blacklist, entry);
|
||||
|
||||
ret = __bch2_journal_seq_blacklist_read(j, i,
|
||||
le64_to_cpu(bl_entry->seq),
|
||||
le64_to_cpu(bl_entry->seq));
|
||||
break;
|
||||
}
|
||||
case BCH_JSET_ENTRY_blacklist_v2: {
|
||||
struct jset_entry_blacklist_v2 *bl_entry =
|
||||
container_of(entry, struct jset_entry_blacklist_v2, entry);
|
||||
|
||||
ret = __bch2_journal_seq_blacklist_read(j, i,
|
||||
le64_to_cpu(bl_entry->start),
|
||||
le64_to_cpu(bl_entry->end));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* After reading the journal and walking the btree, we might have new journal
|
||||
* sequence numbers to blacklist - add entries to the next journal entry to be
|
||||
* written:
|
||||
*/
|
||||
void bch2_journal_seq_blacklist_write(struct journal *j)
|
||||
{
|
||||
struct journal_seq_blacklist *bl = j->new_blacklist;
|
||||
struct jset_entry_blacklist_v2 *bl_entry;
|
||||
struct jset_entry *entry;
|
||||
|
||||
if (!bl)
|
||||
return;
|
||||
|
||||
entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
|
||||
(sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
|
||||
|
||||
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
|
||||
bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
|
||||
bl_entry->start = cpu_to_le64(bl->start);
|
||||
bl_entry->end = cpu_to_le64(bl->end);
|
||||
|
||||
bch2_journal_pin_add(j,
|
||||
journal_cur_seq(j),
|
||||
&bl->pin,
|
||||
journal_seq_blacklist_flush);
|
||||
|
||||
j->new_blacklist = NULL;
|
||||
}
|
libbcachefs/journal_seq_blacklist.h (new file, 13 lines)
@ -0,0 +1,13 @@
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H

struct journal_replay;

struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *, u64);
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
int bch2_journal_seq_blacklist_read(struct journal *,
				    struct journal_replay *);
void bch2_journal_seq_blacklist_write(struct journal *);

#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
@ -59,8 +59,9 @@ struct blacklisted_node {
|
||||
|
||||
struct journal_seq_blacklist {
|
||||
struct list_head list;
|
||||
u64 seq;
|
||||
bool written;
|
||||
u64 start;
|
||||
u64 end;
|
||||
|
||||
struct journal_entry_pin pin;
|
||||
|
||||
struct blacklisted_node *entries;
|
||||
@ -171,10 +172,11 @@ struct journal {
|
||||
u64 front, back, size, mask;
|
||||
struct journal_entry_pin_list *data;
|
||||
} pin;
|
||||
struct journal_entry_pin_list *replay_pin_list;
|
||||
u64 replay_journal_seq;
|
||||
|
||||
struct mutex blacklist_lock;
|
||||
struct list_head seq_blacklist;
|
||||
struct journal_seq_blacklist *new_blacklist;
|
||||
|
||||
BKEY_PADDED(key);
|
||||
struct write_point wp;
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "buckets.h"
|
||||
#include "inode.h"
|
||||
#include "io.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "move.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
@ -22,7 +23,6 @@ struct moving_io {
|
||||
struct closure cl;
|
||||
bool read_completed;
|
||||
|
||||
unsigned read_dev;
|
||||
unsigned read_sectors;
|
||||
unsigned write_sectors;
|
||||
|
||||
@ -42,7 +42,7 @@ struct moving_context {
|
||||
struct list_head reads;
|
||||
|
||||
/* in flight sectors: */
|
||||
atomic_t read_sectors[BCH_SB_MEMBERS_MAX];
|
||||
atomic_t read_sectors;
|
||||
atomic_t write_sectors;
|
||||
|
||||
wait_queue_head_t wait;
|
||||
@ -306,7 +306,8 @@ static void move_write(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
|
||||
if (likely(!io->rbio.bio.bi_status)) {
|
||||
if (likely(!io->rbio.bio.bi_status &&
|
||||
!io->rbio.hole)) {
|
||||
bch2_migrate_read_done(&io->write, &io->rbio);
|
||||
|
||||
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
|
||||
@ -330,7 +331,7 @@ static void move_read_endio(struct bio *bio)
|
||||
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
|
||||
struct moving_context *ctxt = io->write.ctxt;
|
||||
|
||||
atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
|
||||
atomic_sub(io->read_sectors, &ctxt->read_sectors);
|
||||
io->read_completed = true;
|
||||
|
||||
if (next_pending_write(ctxt))
|
||||
@ -376,7 +377,6 @@ static int bch2_move_extent(struct bch_fs *c,
|
||||
enum data_cmd data_cmd,
|
||||
struct data_opts data_opts)
|
||||
{
|
||||
struct extent_pick_ptr pick;
|
||||
struct moving_io *io;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
@ -387,12 +387,8 @@ static int bch2_move_extent(struct bch_fs *c,
|
||||
atomic_read(&ctxt->write_sectors) <
|
||||
SECTORS_IN_FLIGHT_PER_DEVICE);
|
||||
|
||||
bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
|
||||
if (IS_ERR_OR_NULL(pick.ca))
|
||||
return pick.ca ? PTR_ERR(pick.ca) : 0;
|
||||
|
||||
move_ctxt_wait_event(ctxt,
|
||||
atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) <
|
||||
atomic_read(&ctxt->read_sectors) <
|
||||
SECTORS_IN_FLIGHT_PER_DEVICE);
|
||||
|
||||
/* write path might have to decompress data: */
|
||||
@ -406,8 +402,7 @@ static int bch2_move_extent(struct bch_fs *c,
|
||||
goto err;
|
||||
|
||||
io->write.ctxt = ctxt;
|
||||
io->read_dev = pick.ca->dev_idx;
|
||||
io->read_sectors = pick.crc.uncompressed_size;
|
||||
io->read_sectors = e.k->size;
|
||||
io->write_sectors = e.k->size;
|
||||
|
||||
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
|
||||
@ -421,6 +416,7 @@ static int bch2_move_extent(struct bch_fs *c,
|
||||
|
||||
io->rbio.opts = io_opts;
|
||||
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
|
||||
io->rbio.bio.bi_vcnt = pages;
|
||||
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
io->rbio.bio.bi_iter.bi_size = sectors << 9;
|
||||
|
||||
@ -438,7 +434,7 @@ static int bch2_move_extent(struct bch_fs *c,
|
||||
|
||||
trace_move_extent(e.k);
|
||||
|
||||
atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
|
||||
atomic_add(io->read_sectors, &ctxt->read_sectors);
|
||||
list_add_tail(&io->list, &ctxt->reads);
|
||||
|
||||
/*
|
||||
@ -446,14 +442,15 @@ static int bch2_move_extent(struct bch_fs *c,
|
||||
* ctxt when doing wakeup
|
||||
*/
|
||||
closure_get(&ctxt->cl);
|
||||
bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE);
|
||||
bch2_read_extent(c, &io->rbio, e.s_c,
|
||||
BCH_READ_NODECODE|
|
||||
BCH_READ_LAST_FRAGMENT);
|
||||
return 0;
|
||||
err_free_pages:
|
||||
bio_free_pages(&io->write.op.wbio.bio);
|
||||
err_free:
|
||||
kfree(io);
|
||||
err:
|
||||
percpu_ref_put(&pick.ca->io_ref);
|
||||
trace_move_alloc_fail(e.k);
|
||||
return ret;
|
||||
}
|
||||
@ -728,7 +725,7 @@ int bch2_data_job(struct bch_fs *c,
|
||||
switch (op.op) {
|
||||
case BCH_DATA_OP_REREPLICATE:
|
||||
stats->data_type = BCH_DATA_JOURNAL;
|
||||
ret = bch2_journal_flush_device(&c->journal, -1);
|
||||
ret = bch2_journal_flush_device_pins(&c->journal, -1);
|
||||
|
||||
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
|
||||
ret = bch2_gc_btree_replicas(c) ?: ret;
|
||||
@ -745,7 +742,7 @@ int bch2_data_job(struct bch_fs *c,
|
||||
return -EINVAL;
|
||||
|
||||
stats->data_type = BCH_DATA_JOURNAL;
|
||||
ret = bch2_journal_flush_device(&c->journal, op.migrate.dev);
|
||||
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
|
||||
|
||||
ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
|
||||
ret = bch2_gc_btree_replicas(c) ?: ret;
|
||||
|
@ -26,6 +26,8 @@
|
||||
#include "inode.h"
|
||||
#include "io.h"
|
||||
#include "journal.h"
|
||||
#include "journal_io.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "keylist.h"
|
||||
#include "move.h"
|
||||
#include "migrate.h"
|
||||
@ -396,9 +398,15 @@ err:
|
||||
|
||||
static void bch2_fs_free(struct bch_fs *c)
|
||||
{
|
||||
#define BCH_TIME_STAT(name) \
|
||||
bch2_time_stats_exit(&c->name##_time);
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
|
||||
bch2_fs_quota_exit(c);
|
||||
bch2_fs_fsio_exit(c);
|
||||
bch2_fs_encryption_exit(c);
|
||||
bch2_fs_io_exit(c);
|
||||
bch2_fs_btree_cache_exit(c);
|
||||
bch2_fs_journal_exit(&c->journal);
|
||||
bch2_io_clock_exit(&c->io_clock[WRITE]);
|
||||
@ -407,10 +415,6 @@ static void bch2_fs_free(struct bch_fs *c)
|
||||
lg_lock_free(&c->usage_lock);
|
||||
free_percpu(c->usage_percpu);
|
||||
mempool_exit(&c->btree_bounce_pool);
|
||||
mempool_exit(&c->bio_bounce_pages);
|
||||
bioset_exit(&c->bio_write);
|
||||
bioset_exit(&c->bio_read_split);
|
||||
bioset_exit(&c->bio_read);
|
||||
bioset_exit(&c->btree_bio);
|
||||
mempool_exit(&c->btree_interior_update_pool);
|
||||
mempool_exit(&c->btree_reserve_pool);
|
||||
@ -561,8 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
|
||||
init_rwsem(&c->gc_lock);
|
||||
|
||||
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
|
||||
spin_lock_init(&c->name##_time.lock);
|
||||
#define BCH_TIME_STAT(name) \
|
||||
bch2_time_stats_init(&c->name##_time);
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
|
||||
@ -587,9 +591,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
|
||||
seqcount_init(&c->gc_pos_lock);
|
||||
|
||||
c->copy_gc_enabled = 1;
|
||||
c->rebalance_enabled = 1;
|
||||
c->rebalance_percent = 10;
|
||||
c->copy_gc_enabled = 1;
|
||||
c->rebalance_enabled = 1;
|
||||
c->rebalance_percent = 10;
|
||||
c->promote_whole_extents = true;
|
||||
|
||||
c->journal.write_time = &c->journal_write_time;
|
||||
c->journal.delay_time = &c->journal_delay_time;
|
||||
@ -640,17 +645,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
max(offsetof(struct btree_read_bio, bio),
|
||||
offsetof(struct btree_write_bio, wbio.bio)),
|
||||
BIOSET_NEED_BVECS) ||
|
||||
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
|
||||
BIOSET_NEED_BVECS) ||
|
||||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
|
||||
BIOSET_NEED_BVECS) ||
|
||||
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
|
||||
BIOSET_NEED_BVECS) ||
|
||||
mempool_init_page_pool(&c->bio_bounce_pages,
|
||||
max_t(unsigned,
|
||||
c->opts.btree_node_size,
|
||||
c->sb.encoded_extent_max) /
|
||||
PAGE_SECTORS, 0) ||
|
||||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
|
||||
lg_lock_init(&c->usage_lock) ||
|
||||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
|
||||
@ -658,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
|
||||
bch2_fs_journal_init(&c->journal) ||
|
||||
bch2_fs_btree_cache_init(c) ||
|
||||
bch2_fs_io_init(c) ||
|
||||
bch2_fs_encryption_init(c) ||
|
||||
bch2_fs_compress_init(c) ||
|
||||
bch2_fs_fsio_init(c))
|
||||
@ -774,11 +769,11 @@ const char *bch2_fs_start(struct bch_fs *c)
|
||||
goto recovery_done;
|
||||
|
||||
/*
|
||||
* bch2_journal_start() can't happen sooner, or btree_gc_finish()
|
||||
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
|
||||
* will give spurious errors about oldest_gen > bucket_gen -
|
||||
* this is a hack but oh well.
|
||||
*/
|
||||
bch2_journal_start(c);
|
||||
bch2_fs_journal_start(&c->journal);
|
||||
|
||||
err = "error starting allocator";
|
||||
if (bch2_fs_allocator_start(c))
|
||||
@ -834,7 +829,7 @@ const char *bch2_fs_start(struct bch_fs *c)
|
||||
* journal_res_get() will crash if called before this has
|
||||
* set up the journal.pin FIFO and journal.cur pointer:
|
||||
*/
|
||||
bch2_journal_start(c);
|
||||
bch2_fs_journal_start(&c->journal);
|
||||
bch2_journal_set_replay_done(&c->journal);
|
||||
|
||||
err = "error starting allocator";
|
||||
@ -993,6 +988,9 @@ static void bch2_dev_free(struct bch_dev *ca)
|
||||
bioset_exit(&ca->replica_set);
|
||||
bch2_dev_buckets_free(ca);
|
||||
|
||||
bch2_time_stats_exit(&ca->io_latency[WRITE]);
|
||||
bch2_time_stats_exit(&ca->io_latency[READ]);
|
||||
|
||||
percpu_ref_exit(&ca->io_ref);
|
||||
percpu_ref_exit(&ca->ref);
|
||||
kobject_put(&ca->kobj);
|
||||
@ -1089,6 +1087,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
|
||||
|
||||
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
|
||||
|
||||
bch2_time_stats_init(&ca->io_latency[READ]);
|
||||
bch2_time_stats_init(&ca->io_latency[WRITE]);
|
||||
|
||||
ca->mi = bch2_mi_to_cpu(member);
|
||||
ca->uuid = member->uuid;
|
||||
|
||||
@ -1421,7 +1422,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
|
||||
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
|
||||
if (ret) {
|
||||
bch_err(ca, "Remove failed: error %i flushing journal", ret);
|
||||
goto err;
|
||||
|
@ -27,7 +27,26 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
|
||||
|
||||
static inline bool bch2_dev_is_online(struct bch_dev *ca)
|
||||
{
|
||||
return ca->disk_sb.bdev != NULL;
|
||||
return !percpu_ref_is_zero(&ca->io_ref);
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_is_readable(struct bch_dev *ca)
|
||||
{
|
||||
return bch2_dev_is_online(ca) &&
|
||||
ca->mi.state != BCH_MEMBER_STATE_FAILED;
|
||||
}
|
||||
|
||||
static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
|
||||
{
|
||||
if (!percpu_ref_tryget(&ca->io_ref))
|
||||
return false;
|
||||
|
||||
if (ca->mi.state == BCH_MEMBER_STATE_RW ||
|
||||
(ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ))
|
||||
return true;
|
||||
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
return false;
|
||||
}
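
Not part of the diff: a hypothetical caller showing the intended get/put pairing for the io_ref helper above; the function name and body are made up.

static void read_super_from_dev(struct bch_dev *ca)
{
	if (!bch2_dev_get_ioref(ca, READ))
		return;			/* offline, or failed */

	/* ... safe to issue IO against the device here ... */

	percpu_ref_put(&ca->io_ref);
}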
|
||||
|
||||
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
|
||||
|
@ -141,11 +141,19 @@ read_attribute(btree_node_size);
|
||||
read_attribute(first_bucket);
|
||||
read_attribute(nbuckets);
|
||||
read_attribute(durability);
|
||||
read_attribute(iostats);
|
||||
read_attribute(last_read_quantiles);
|
||||
read_attribute(last_write_quantiles);
|
||||
read_attribute(fragmentation_quantiles);
|
||||
read_attribute(oldest_gen_quantiles);
|
||||
read_attribute(iodone);
|
||||
|
||||
read_attribute(io_latency_read);
|
||||
read_attribute(io_latency_write);
|
||||
read_attribute(io_latency_stats_read);
|
||||
read_attribute(io_latency_stats_write);
|
||||
read_attribute(congested);
|
||||
|
||||
read_attribute(bucket_quantiles_last_read);
|
||||
read_attribute(bucket_quantiles_last_write);
|
||||
read_attribute(bucket_quantiles_fragmentation);
|
||||
read_attribute(bucket_quantiles_oldest_gen);
|
||||
|
||||
read_attribute(reserve_stats);
|
||||
read_attribute(btree_cache_size);
|
||||
read_attribute(compression_stats);
|
||||
@ -177,6 +185,7 @@ sysfs_pd_controller_attribute(copy_gc);
|
||||
rw_attribute(rebalance_enabled);
|
||||
rw_attribute(rebalance_percent);
|
||||
sysfs_pd_controller_attribute(rebalance);
|
||||
rw_attribute(promote_whole_extents);
|
||||
|
||||
rw_attribute(pd_controllers_update_seconds);
|
||||
|
||||
@ -189,8 +198,9 @@ read_attribute(data_replicas_have);
|
||||
BCH_DEBUG_PARAMS()
|
||||
#undef BCH_DEBUG_PARAM
|
||||
|
||||
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
|
||||
sysfs_time_stats_attribute(name, frequency_units, duration_units);
|
||||
#define BCH_TIME_STAT(_name) \
|
||||
static struct attribute sysfs_time_stat_##_name = \
|
||||
{ .name = #_name, .mode = S_IRUGO };
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
|
||||
@ -332,9 +342,10 @@ SHOW(bch2_fs)
|
||||
|
||||
sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled);
|
||||
sysfs_print(rebalance_percent, c->rebalance_percent);
|
||||
|
||||
sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */
|
||||
|
||||
sysfs_print(promote_whole_extents, c->promote_whole_extents);
|
||||
|
||||
sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
|
||||
sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
|
||||
|
||||
@ -406,6 +417,8 @@ STORE(__bch2_fs)
|
||||
sysfs_strtoul(rebalance_percent, c->rebalance_percent);
|
||||
sysfs_pd_controller_store(rebalance, &c->rebalance_pd);
|
||||
|
||||
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
|
||||
|
||||
/* Debugging: */
|
||||
|
||||
#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
|
||||
@ -462,6 +475,7 @@ struct attribute *bch2_fs_files[] = {
|
||||
&sysfs_journal_reclaim_delay_ms,
|
||||
|
||||
&sysfs_rebalance_percent,
|
||||
&sysfs_promote_whole_extents,
|
||||
|
||||
&sysfs_compression_stats,
|
||||
NULL
|
||||
@ -531,9 +545,16 @@ STORE(bch2_fs_opts_dir)
|
||||
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
|
||||
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
|
||||
int ret, id = opt - bch2_opt_table;
|
||||
char *tmp;
|
||||
u64 v;
|
||||
|
||||
ret = bch2_opt_parse(c, opt, buf, &v);
|
||||
tmp = kstrdup(buf, GFP_KERNEL);
|
||||
if (!tmp)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = bch2_opt_parse(c, opt, strim(tmp), &v);
|
||||
kfree(tmp);
|
||||
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
@ -592,9 +613,9 @@ SHOW(bch2_fs_time_stats)
|
||||
{
|
||||
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
|
||||
|
||||
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
|
||||
sysfs_print_time_stats(&c->name##_time, name, \
|
||||
frequency_units, duration_units);
|
||||
#define BCH_TIME_STAT(name) \
|
||||
if (attr == &sysfs_time_stat_##name) \
|
||||
return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE);
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
|
||||
@ -603,23 +624,15 @@ SHOW(bch2_fs_time_stats)
|
||||
|
||||
STORE(bch2_fs_time_stats)
|
||||
{
|
||||
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
|
||||
|
||||
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
|
||||
sysfs_clear_time_stats(&c->name##_time, name);
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
|
||||
return size;
|
||||
}
|
||||
SYSFS_OPS(bch2_fs_time_stats);
|
||||
|
||||
struct attribute *bch2_fs_time_stats_files[] = {
|
||||
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
|
||||
sysfs_time_stats_attribute_list(name, frequency_units, duration_units)
|
||||
#define BCH_TIME_STAT(name) \
|
||||
&sysfs_time_stat_##name,
|
||||
BCH_TIME_STATS()
|
||||
#undef BCH_TIME_STAT
|
||||
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -774,7 +787,7 @@ static const char * const bch2_rw[] = {
|
||||
NULL
|
||||
};
|
||||
|
||||
static ssize_t show_dev_iostats(struct bch_dev *ca, char *buf)
|
||||
static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf)
|
||||
{
|
||||
char *out = buf, *end = buf + PAGE_SIZE;
|
||||
int rw, i, cpu;
|
||||
@ -851,16 +864,28 @@ SHOW(bch2_dev)
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
if (attr == &sysfs_iostats)
|
||||
return show_dev_iostats(ca, buf);
|
||||
if (attr == &sysfs_iodone)
|
||||
return show_dev_iodone(ca, buf);
|
||||
|
||||
if (attr == &sysfs_last_read_quantiles)
|
||||
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
|
||||
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
|
||||
|
||||
if (attr == &sysfs_io_latency_stats_read)
|
||||
return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE);
|
||||
if (attr == &sysfs_io_latency_stats_write)
|
||||
return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE);
|
||||
|
||||
sysfs_printf(congested, "%u%%",
|
||||
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
|
||||
* 100 / CONGESTED_MAX);
|
||||
|
||||
if (attr == &sysfs_bucket_quantiles_last_read)
|
||||
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
|
||||
if (attr == &sysfs_last_write_quantiles)
|
||||
if (attr == &sysfs_bucket_quantiles_last_write)
|
||||
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
|
||||
if (attr == &sysfs_fragmentation_quantiles)
|
||||
if (attr == &sysfs_bucket_quantiles_fragmentation)
|
||||
return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
|
||||
if (attr == &sysfs_oldest_gen_quantiles)
|
||||
if (attr == &sysfs_bucket_quantiles_oldest_gen)
|
||||
return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
|
||||
|
||||
if (attr == &sysfs_reserve_stats)
|
||||
@ -944,13 +969,20 @@ struct attribute *bch2_dev_files[] = {
|
||||
&sysfs_label,
|
||||
|
||||
&sysfs_has_data,
|
||||
&sysfs_iostats,
|
||||
&sysfs_iodone,
|
||||
|
||||
&sysfs_io_latency_read,
|
||||
&sysfs_io_latency_write,
|
||||
&sysfs_io_latency_stats_read,
|
||||
&sysfs_io_latency_stats_write,
|
||||
&sysfs_congested,
|
||||
|
||||
/* alloc info - other stats: */
|
||||
&sysfs_last_read_quantiles,
|
||||
&sysfs_last_write_quantiles,
|
||||
&sysfs_fragmentation_quantiles,
|
||||
&sysfs_oldest_gen_quantiles,
|
||||
&sysfs_bucket_quantiles_last_read,
|
||||
&sysfs_bucket_quantiles_last_write,
|
||||
&sysfs_bucket_quantiles_fragmentation,
|
||||
&sysfs_bucket_quantiles_oldest_gen,
|
||||
|
||||
&sysfs_reserve_stats,
|
||||
|
||||
/* debug: */
|
||||
|
@ -13,12 +13,15 @@
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/math64.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/preempt.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/sched/clock.h>
|
||||
|
||||
#include "eytzinger.h"
|
||||
#include "util.h"
|
||||
|
||||
#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
|
||||
@ -200,59 +203,189 @@ bool bch2_is_zero(const void *_p, size_t n)
|
||||
return true;
|
||||
}
|
||||
|
||||
void bch2_time_stats_clear(struct time_stats *stats)
|
||||
void bch2_quantiles_update(struct quantiles *q, u64 v)
|
||||
{
|
||||
spin_lock(&stats->lock);
|
||||
unsigned i = 0;
|
||||
|
||||
stats->count = 0;
|
||||
stats->last_duration = 0;
|
||||
stats->max_duration = 0;
|
||||
stats->average_duration = 0;
|
||||
stats->average_frequency = 0;
|
||||
stats->last = 0;
|
||||
while (i < ARRAY_SIZE(q->entries)) {
|
||||
struct quantile_entry *e = q->entries + i;
|
||||
|
||||
spin_unlock(&stats->lock);
|
||||
if (unlikely(!e->step)) {
|
||||
e->m = v;
|
||||
e->step = max_t(unsigned, v / 2, 1024);
|
||||
} else if (e->m > v) {
|
||||
e->m = e->m >= e->step
|
||||
? e->m - e->step
|
||||
: 0;
|
||||
} else if (e->m < v) {
|
||||
e->m = e->m + e->step > e->m
|
||||
? e->m + e->step
|
||||
: U32_MAX;
|
||||
}
|
||||
|
||||
if ((e->m > v ? e->m - v : v - e->m) < e->step)
|
||||
e->step = max_t(unsigned, e->step / 2, 1);
|
||||
|
||||
if (v >= e->m)
|
||||
break;
|
||||
|
||||
i = eytzinger0_child(i, v > e->m);
|
||||
}
|
||||
}
|
||||
|
||||
void __bch2_time_stats_update(struct time_stats *stats, u64 start_time)
|
||||
/* time stats: */
|
||||
|
||||
static void bch2_time_stats_update_one(struct time_stats *stats,
|
||||
u64 start, u64 end)
|
||||
{
|
||||
u64 now, duration, last;
|
||||
u64 duration, freq;
|
||||
|
||||
duration = time_after64(end, start)
|
||||
? end - start : 0;
|
||||
freq = time_after64(end, stats->last_event)
|
||||
? end - stats->last_event : 0;
|
||||
|
||||
stats->count++;
|
||||
|
||||
now = local_clock();
|
||||
duration = time_after64(now, start_time)
|
||||
? now - start_time : 0;
|
||||
last = time_after64(now, stats->last)
|
||||
? now - stats->last : 0;
|
||||
stats->average_duration = stats->average_duration
|
||||
? ewma_add(stats->average_duration, duration, 6)
|
||||
: duration;
|
||||
|
||||
stats->average_frequency = stats->average_frequency
|
||||
? ewma_add(stats->average_frequency, freq, 6)
|
||||
: freq;
|
||||
|
||||
stats->last_duration = duration;
|
||||
stats->max_duration = max(stats->max_duration, duration);
|
||||
|
||||
if (stats->last) {
|
||||
stats->average_duration = ewma_add(stats->average_duration,
|
||||
duration << 8, 3);
|
||||
stats->last_event = end;
|
||||
|
||||
if (stats->average_frequency)
|
||||
stats->average_frequency =
|
||||
ewma_add(stats->average_frequency,
|
||||
last << 8, 3);
|
||||
else
|
||||
stats->average_frequency = last << 8;
|
||||
bch2_quantiles_update(&stats->quantiles, duration);
|
||||
}
|
||||
|
||||
void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
if (!stats->buffer) {
|
||||
spin_lock_irqsave(&stats->lock, flags);
|
||||
bch2_time_stats_update_one(stats, start, end);
|
||||
|
||||
if (stats->average_frequency < 32 &&
|
||||
stats->count > 1024)
|
||||
stats->buffer =
|
||||
alloc_percpu_gfp(struct time_stat_buffer,
|
||||
GFP_ATOMIC);
|
||||
spin_unlock_irqrestore(&stats->lock, flags);
|
||||
} else {
|
||||
stats->average_duration = duration << 8;
|
||||
struct time_stat_buffer_entry *i;
|
||||
struct time_stat_buffer *b;
|
||||
|
||||
preempt_disable();
|
||||
b = this_cpu_ptr(stats->buffer);
|
||||
|
||||
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
|
||||
b->entries[b->nr++] = (struct time_stat_buffer_entry) {
|
||||
.start = start,
|
||||
.end = end
|
||||
};
|
||||
|
||||
if (b->nr == ARRAY_SIZE(b->entries)) {
|
||||
spin_lock_irqsave(&stats->lock, flags);
|
||||
for (i = b->entries;
|
||||
i < b->entries + ARRAY_SIZE(b->entries);
|
||||
i++)
|
||||
bch2_time_stats_update_one(stats, i->start, i->end);
|
||||
spin_unlock_irqrestore(&stats->lock, flags);
|
||||
|
||||
b->nr = 0;
|
||||
}
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
}
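
Not part of the diff: a hypothetical caller of the new time stats API. It assumes the btree_sort_time field generated by BCH_TIME_STATS(); the end timestamp is supplied by the bch2_time_stats_update() wrapper (defined later in util.h) via local_clock().

static void timed_btree_sort(struct bch_fs *c, struct btree *b)
{
	u64 start = local_clock();

	/* ... the work being measured, e.g. sorting the node ... */

	bch2_time_stats_update(&c->btree_sort_time, start);
}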
|
||||
|
||||
static const struct time_unit {
|
||||
const char *name;
|
||||
u32 nsecs;
|
||||
} time_units[] = {
|
||||
{ "ns", 1 },
|
||||
{ "us", NSEC_PER_USEC },
|
||||
{ "ms", NSEC_PER_MSEC },
|
||||
{ "sec", NSEC_PER_SEC },
|
||||
};
|
||||
|
||||
static const struct time_unit *pick_time_units(u64 ns)
|
||||
{
|
||||
const struct time_unit *u;
|
||||
|
||||
for (u = time_units;
|
||||
u + 1 < time_units + ARRAY_SIZE(time_units) &&
|
||||
ns >= u[1].nsecs << 1;
|
||||
u++)
|
||||
;
|
||||
|
||||
return u;
|
||||
}
|
||||
|
||||
static size_t pr_time_units(char *buf, size_t len, u64 ns)
|
||||
{
|
||||
const struct time_unit *u = pick_time_units(ns);
|
||||
|
||||
return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name);
|
||||
}
|
||||
|
||||
size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len)
|
||||
{
|
||||
char *out = buf, *end = buf + len;
|
||||
const struct time_unit *u;
|
||||
u64 freq = READ_ONCE(stats->average_frequency);
|
||||
u64 q, last_q = 0;
|
||||
int i;
|
||||
|
||||
out += scnprintf(out, end - out, "count:\t\t%llu\n",
|
||||
stats->count);
|
||||
out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n",
|
||||
freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
|
||||
|
||||
out += scnprintf(out, end - out, "frequency:\t");
|
||||
out += pr_time_units(out, end - out, freq);
|
||||
|
||||
out += scnprintf(out, end - out, "\navg duration:\t");
|
||||
out += pr_time_units(out, end - out, stats->average_duration);
|
||||
|
||||
out += scnprintf(out, end - out, "\nmax duration:\t");
|
||||
out += pr_time_units(out, end - out, stats->max_duration);
|
||||
|
||||
i = eytzinger0_first(NR_QUANTILES);
|
||||
u = pick_time_units(stats->quantiles.entries[i].m);
|
||||
|
||||
out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name);
|
||||
eytzinger0_for_each(i, NR_QUANTILES) {
|
||||
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
|
||||
|
||||
q = max(stats->quantiles.entries[i].m, last_q);
|
||||
out += scnprintf(out, end - out, "%llu%s",
|
||||
div_u64(q, u->nsecs),
|
||||
is_last ? "\n" : " ");
|
||||
last_q = q;
|
||||
}
|
||||
|
||||
stats->last = now ?: 1;
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
void bch2_time_stats_update(struct time_stats *stats, u64 start_time)
|
||||
void bch2_time_stats_exit(struct time_stats *stats)
|
||||
{
|
||||
spin_lock(&stats->lock);
|
||||
__bch2_time_stats_update(stats, start_time);
|
||||
spin_unlock(&stats->lock);
|
||||
free_percpu(stats->buffer);
|
||||
}
|
||||
|
||||
void bch2_time_stats_init(struct time_stats *stats)
|
||||
{
|
||||
memset(stats, 0, sizeof(*stats));
|
||||
spin_lock_init(&stats->lock);
|
||||
}
|
||||
|
||||
/* ratelimit: */
|
||||
|
||||
/**
|
||||
* bch2_ratelimit_delay() - return how long to delay until the next time to do
|
||||
* some work
|
||||
@ -310,6 +443,8 @@ int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
|
||||
}
|
||||
}
|
||||
|
||||
/* pd controller: */
|
||||
|
||||
/*
|
||||
* Updates pd_controller. Attempts to scale inputed values to units per second.
|
||||
* @target: desired value
|
||||
@ -404,6 +539,8 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
|
||||
derivative, change, next_io);
|
||||
}
|
||||
|
||||
/* misc: */
|
||||
|
||||
void bch2_bio_map(struct bio *bio, void *base)
|
||||
{
|
||||
size_t size = bio->bi_iter.bi_size;
|
||||
|
@ -371,87 +371,50 @@ ssize_t bch2_read_string_list(const char *, const char * const[]);
|
||||
ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64);
|
||||
u64 bch2_read_flag_list(char *, const char * const[]);
|
||||
|
||||
#define NR_QUANTILES 15
|
||||
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
|
||||
#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
|
||||
#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
|
||||
|
||||
struct quantiles {
|
||||
struct quantile_entry {
|
||||
u64 m;
|
||||
u64 step;
|
||||
} entries[NR_QUANTILES];
|
||||
};
|
||||
|
||||
struct time_stat_buffer {
|
||||
unsigned nr;
|
||||
struct time_stat_buffer_entry {
|
||||
u64 start;
|
||||
u64 end;
|
||||
} entries[32];
|
||||
};
|
||||
|
||||
struct time_stats {
|
||||
spinlock_t lock;
|
||||
u64 count;
|
||||
/*
|
||||
* all fields are in nanoseconds, averages are ewmas stored left shifted
|
||||
* by 8
|
||||
*/
|
||||
u64 last_duration;
|
||||
u64 max_duration;
|
||||
/* all fields are in nanoseconds */
|
||||
u64 average_duration;
|
||||
u64 average_frequency;
|
||||
u64 last;
|
||||
u64 max_duration;
|
||||
u64 last_event;
|
||||
struct quantiles quantiles;
|
||||
|
||||
struct time_stat_buffer __percpu *buffer;
|
||||
};
|
||||
|
||||
void bch2_time_stats_clear(struct time_stats *stats);
|
||||
void __bch2_time_stats_update(struct time_stats *stats, u64 time);
|
||||
void bch2_time_stats_update(struct time_stats *stats, u64 time);
|
||||
void __bch2_time_stats_update(struct time_stats *stats, u64, u64);
|
||||
|
||||
static inline unsigned local_clock_us(void)
|
||||
static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
|
||||
{
|
||||
return local_clock() >> 10;
|
||||
__bch2_time_stats_update(stats, start, local_clock());
|
||||
}
|
||||
|
||||
#define NSEC_PER_ns 1L
|
||||
#define NSEC_PER_us NSEC_PER_USEC
|
||||
#define NSEC_PER_ms NSEC_PER_MSEC
|
||||
#define NSEC_PER_sec NSEC_PER_SEC
|
||||
size_t bch2_time_stats_print(struct time_stats *, char *, size_t);
|
||||
|
||||
#define __print_time_stat(stats, name, stat, units) \
|
||||
sysfs_print(name ## _ ## stat ## _ ## units, \
|
||||
div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
|
||||
|
||||
#define sysfs_print_time_stats(stats, name, \
|
||||
frequency_units, \
|
||||
duration_units) \
|
||||
do { \
|
||||
__print_time_stat(stats, name, \
|
||||
average_frequency, frequency_units); \
|
||||
__print_time_stat(stats, name, \
|
||||
average_duration, duration_units); \
|
||||
sysfs_print(name ## _ ##count, (stats)->count); \
|
||||
sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \
|
||||
div_u64((stats)->last_duration, \
|
||||
NSEC_PER_ ## duration_units)); \
|
||||
sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
|
||||
div_u64((stats)->max_duration, \
|
||||
NSEC_PER_ ## duration_units)); \
|
||||
\
|
||||
sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
|
||||
? div_s64(local_clock() - (stats)->last, \
|
||||
NSEC_PER_ ## frequency_units) \
|
||||
: -1LL); \
|
||||
} while (0)
|
||||
|
||||
#define sysfs_clear_time_stats(stats, name) \
|
||||
do { \
|
||||
if (attr == &sysfs_ ## name ## _clear) \
|
||||
bch2_time_stats_clear(stats); \
|
||||
} while (0)
|
||||
|
||||
#define sysfs_time_stats_attribute(name, \
|
||||
frequency_units, \
|
||||
duration_units) \
|
||||
write_attribute(name ## _clear); \
|
||||
read_attribute(name ## _count); \
|
||||
read_attribute(name ## _average_frequency_ ## frequency_units); \
|
||||
read_attribute(name ## _average_duration_ ## duration_units); \
|
||||
read_attribute(name ## _last_duration_ ## duration_units); \
|
||||
read_attribute(name ## _max_duration_ ## duration_units); \
|
||||
read_attribute(name ## _last_ ## frequency_units)
|
||||
|
||||
#define sysfs_time_stats_attribute_list(name, \
|
||||
frequency_units, \
|
||||
duration_units) \
|
||||
&sysfs_ ## name ## _clear, \
|
||||
&sysfs_ ## name ## _count, \
|
||||
&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
|
||||
&sysfs_ ## name ## _average_duration_ ## duration_units, \
|
||||
&sysfs_ ## name ## _last_duration_ ## duration_units, \
|
||||
&sysfs_ ## name ## _max_duration_ ## duration_units, \
|
||||
&sysfs_ ## name ## _last_ ## frequency_units,
|
||||
void bch2_time_stats_exit(struct time_stats *);
|
||||
void bch2_time_stats_init(struct time_stats *);
|
||||
|
||||
#define ewma_add(ewma, val, weight) \
|
||||
({ \