Update bcachefs sources to e1d0fb8c5f bcachefs: Don't require flush/fua on every journal write

Kent Overstreet 2020-12-04 13:41:49 -05:00
parent d7fdc2b61e
commit db931a4571
28 changed files with 804 additions and 438 deletions
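The core change in this update: journal writes no longer unconditionally carry REQ_PREFLUSH|REQ_FUA. When the new journal_no_flush feature is enabled and nothing demands durability yet, the entry is written with JSET_NO_FLUSH set and last_seq copied from the last flushed entry; on recovery, any such entries newer than the most recent flush entry are blacklisted rather than replayed. A condensed sketch of the write-side decision, with an invented helper name wrapping the checks added to bch2_journal_write() further down:

static bool journal_write_wants_flush(struct bch_fs *c, struct journal *j,
				      struct journal_buf *w)
{
	/* Sketch only; mirrors the conditions in the bch2_journal_write() hunk below */
	return !(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)) ||
	       w->must_flush ||
	       (jiffies - j->last_flush_write) >= msecs_to_jiffies(j->write_delay_ms) ||
	       !test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
}

A flush write keeps REQ_PREFLUSH|REQ_FUA on its bio; a noflush write goes out as plain REQ_OP_WRITE|REQ_SYNC|REQ_META and also skips the per-device flush bios.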

View File

@ -1 +1 @@
00104032654027a8f4406a82d28911b243f19d94
e1d0fb8c5fbc70df1007ebf5d9ab03018dc05275

View File

@ -26,7 +26,6 @@
#define list_for_each_entry(p, h, m) cds_list_for_each_entry(p, h, m)
#define list_for_each_entry_reverse(p, h, m) cds_list_for_each_entry_reverse(p, h, m)
#define list_for_each_entry_safe(p, n, h, m) cds_list_for_each_entry_safe(p, n, h, m)
#define list_for_each_entry_safe_reverse(p, n, h, m) cds_list_for_each_entry_safe_reverse(p, n, h, m)
static inline int list_empty_careful(const struct list_head *head)
{
@ -54,6 +53,15 @@ static inline void list_splice_init(struct list_head *list,
#define list_first_entry_or_null(ptr, type, member) \
(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
#define list_prev_entry(pos, member) \
list_entry((pos)->member.prev, typeof(*(pos)), member)
#define list_for_each_entry_safe_reverse(pos, n, head, member) \
for (pos = list_last_entry(head, typeof(*pos), member), \
n = list_prev_entry(pos, member); \
&pos->member != (head); \
pos = n, n = list_prev_entry(n, member))
/* hlists: */
#include <urcu/hlist.h>
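The userspace list.h shim gains list_for_each_entry_safe_reverse() (built on list_last_entry()/list_prev_entry()), which the journal read changes below use to walk replay entries newest-first while freeing some of them. A small hypothetical usage example, assuming an illustrative struct item:

#include <stdlib.h>

struct item {
	struct list_head list;
	int v;
};

/* Walk tail-to-head, unlinking and freeing as we go; the _safe variant
 * caches the previous entry so deletion during iteration is fine. */
static void free_all_reverse(struct list_head *head)
{
	struct item *i, *n;

	list_for_each_entry_safe_reverse(i, n, head, list) {
		list_del(&i->list);
		free(i);
	}
}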

View File

@ -214,9 +214,11 @@
dynamic_fault("bcachefs:meta:write:" name)
#ifdef __KERNEL__
#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
#else
#define bch2_fmt(_c, fmt) fmt "\n"
#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
#endif
#define bch_info(c, fmt, ...) \
@ -229,8 +231,11 @@
printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_ratelimited(c, fmt, ...) \
printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_verbose(c, fmt, ...) \
do { \
@ -668,6 +673,7 @@ struct bch_fs {
unsigned bucket_size_max;
atomic64_t sectors_available;
struct mutex sectors_available_lock;
struct bch_fs_pcpu __percpu *pcpu;
@ -675,7 +681,7 @@ struct bch_fs {
seqcount_t usage_lock;
struct bch_fs_usage *usage_base;
struct bch_fs_usage __percpu *usage[2];
struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_fs_usage __percpu *usage_gc;
/* single element mempool: */

View File

@ -1332,14 +1332,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(extents_above_btree_updates, 12) \
x(btree_updates_journalled, 13) \
x(reflink_inline_data, 14) \
x(new_varint, 15)
x(new_varint, 15) \
x(journal_no_flush, 16)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_new_varint))\
(1ULL << BCH_FEATURE_new_varint)| \
(1ULL << BCH_FEATURE_journal_no_flush))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@ -1575,6 +1577,7 @@ struct jset {
LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
#define BCH_JOURNAL_BUCKETS_MIN 8
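On-disk additions: a journal_no_flush feature bit (now part of BCH_SB_FEATURES_ALL) and a per-jset JSET_NO_FLUSH flag. A non-flush entry's last_seq is not authoritative, so recovery scans backwards for the newest entry with the flag clear. A condensed sketch of that scan, following the bch2_journal_read() hunk later in this commit:

	u64 last_seq = 0, blacklist_seq = 0;
	struct journal_replay *i;

	/* only a flush entry carries an authoritative last_seq */
	list_for_each_entry_reverse(i, list, list)
		if (!JSET_NO_FLUSH(&i->j)) {
			last_seq	= le64_to_cpu(i->j.last_seq);
			blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
			break;
		}
	/* entries with seq >= blacklist_seq were written without a flush
	 * and get blacklisted rather than replayed */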

View File

@ -603,7 +603,6 @@ static int bch2_gc_done(struct bch_fs *c,
struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src;
unsigned i;
c->ec_stripes_heap.used = 0;
@ -651,8 +650,8 @@ static int bch2_gc_done(struct bch_fs *c,
}
};
bch2_fs_usage_acc_to_base(c, 0);
bch2_fs_usage_acc_to_base(c, 1);
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
bch2_dev_usage_from_buckets(c);

View File

@ -597,18 +597,25 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
bch2_btree_iter_reinit_node(iter, b);
}
static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
struct btree *b)
{
pr_buf(out, "%s level %u/%u\n ",
bch2_btree_ids[b->c.btree_id],
b->c.level,
c->btree_roots[b->c.btree_id].level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct btree *b, struct bset *i,
unsigned offset, int write)
{
pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n"
"pos ",
write ? "before write " : "",
b->c.btree_id, b->c.level,
c->btree_roots[b->c.btree_id].level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
pr_buf(out, "error validating btree node %sat btree ",
write ? "before write " : "");
btree_pos_to_text(out, c, b);
pr_buf(out, " node offset %u", b->written);
pr_buf(out, "\n node offset %u", b->written);
if (i)
pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
}
@ -1104,6 +1111,8 @@ static void btree_node_read_work(struct work_struct *work)
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
char buf[200];
struct printbuf out;
bool can_retry;
goto start;
@ -1123,8 +1132,10 @@ static void btree_node_read_work(struct work_struct *work)
bio->bi_status = BLK_STS_REMOVED;
}
start:
bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s",
bch2_blk_status_to_str(bio->bi_status));
out = PBUF(buf);
btree_pos_to_text(&out, c, b);
bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
@ -1408,7 +1419,7 @@ static void btree_node_write_endio(struct bio *bio)
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s",
if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
@ -1488,6 +1499,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
if (!btree_node_may_write(b))
return;
if (old & (1 << BTREE_NODE_never_write))
return;
if (old & (1 << BTREE_NODE_write_in_flight)) {
btree_node_wait_on_io(b);
continue;
@ -1534,6 +1548,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
seq = max(seq, le64_to_cpu(i->journal_seq));
}
BUG_ON(b->written && !seq);
/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
bytes += 8;

View File

@ -415,6 +415,7 @@ enum btree_flags {
BTREE_NODE_fake,
BTREE_NODE_old_extent_overwrite,
BTREE_NODE_need_rewrite,
BTREE_NODE_never_write,
};
BTREE_FLAG(read_in_flight);
@ -429,6 +430,7 @@ BTREE_FLAG(dying);
BTREE_FLAG(fake);
BTREE_FLAG(old_extent_overwrite);
BTREE_FLAG(need_rewrite);
BTREE_FLAG(never_write);
static inline struct btree_write *btree_current_write(struct btree *b)
{

View File

@ -603,17 +603,30 @@ err:
list_del(&as->write_blocked_list);
if (!ret && as->b == b) {
/*
* Node might have been freed, recheck under
* btree_interior_update_lock:
*/
if (as->b == b) {
struct bset *i = btree_bset_last(b);
BUG_ON(!b->c.level);
BUG_ON(!btree_node_dirty(b));
i->journal_seq = cpu_to_le64(
max(journal_seq,
le64_to_cpu(i->journal_seq)));
if (!ret) {
i->journal_seq = cpu_to_le64(
max(journal_seq,
le64_to_cpu(i->journal_seq)));
bch2_btree_add_journal_pin(c, b, journal_seq);
bch2_btree_add_journal_pin(c, b, journal_seq);
} else {
/*
* If we didn't get a journal sequence number we
* can't write this btree node, because recovery
* won't know to ignore this write:
*/
set_btree_node_never_write(b);
}
}
mutex_unlock(&c->btree_interior_update_lock);

View File

@ -649,13 +649,13 @@ int bch2_trans_commit_error(struct btree_trans *trans,
case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
bch2_trans_unlock(trans);
while (bch2_btree_key_cache_must_wait(c)) {
do {
mutex_lock(&c->journal.reclaim_lock);
bch2_journal_reclaim(&c->journal);
ret = bch2_journal_reclaim(&c->journal);
mutex_unlock(&c->journal.reclaim_lock);
}
} while (!ret && bch2_btree_key_cache_must_wait(c));
if (bch2_trans_relock(trans))
if (!ret && bch2_trans_relock(trans))
return 0;
trace_trans_restart_journal_reclaim(trans->ip);

View File

@ -142,8 +142,8 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
percpu_down_write(&c->mark_lock);
usage = c->usage_base;
bch2_fs_usage_acc_to_base(c, 0);
bch2_fs_usage_acc_to_base(c, 1);
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
for (i = 0; i < BCH_REPLICAS_MAX; i++)
usage->reserved += usage->persistent_reserved[i];
@ -207,13 +207,13 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
{
return this_cpu_ptr(gc
? c->usage_gc
: c->usage[journal_seq & 1]);
: c->usage[journal_seq & JOURNAL_BUF_MASK]);
}
u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
{
ssize_t offset = v - (u64 *) c->usage_base;
unsigned seq;
unsigned i, seq;
u64 ret;
BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
@ -221,9 +221,10 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
do {
seq = read_seqcount_begin(&c->usage_lock);
ret = *v +
percpu_u64_get((u64 __percpu *) c->usage[0] + offset) +
percpu_u64_get((u64 __percpu *) c->usage[1] + offset);
ret = *v;
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
@ -232,7 +233,7 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
struct bch_fs_usage *ret;
unsigned seq, v, u64s = fs_usage_u64s(c);
unsigned seq, i, v, u64s = fs_usage_u64s(c);
retry:
ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
if (unlikely(!ret))
@ -251,8 +252,8 @@ retry:
do {
seq = read_seqcount_begin(&c->usage_lock);
memcpy(ret, c->usage_base, u64s * sizeof(u64));
acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s);
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
@ -262,7 +263,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
unsigned u64s = fs_usage_u64s(c);
BUG_ON(idx >= 2);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
preempt_disable();
write_seqcount_begin(&c->usage_lock);
@ -2031,13 +2032,6 @@ int bch2_trans_mark_update(struct btree_trans *trans,
/* Disk reservations: */
static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
percpu_u64_set(&c->pcpu->sectors_available, 0);
return avail_factor(__bch2_fs_usage_read_short(c).free);
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read(&c->mark_lock);
@ -2072,7 +2066,6 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
if (get < sectors) {
preempt_enable();
percpu_up_read(&c->mark_lock);
goto recalculate;
}
} while ((v = atomic64_cmpxchg(&c->sectors_available,
@ -2090,9 +2083,10 @@ out:
return 0;
recalculate:
percpu_down_write(&c->mark_lock);
mutex_lock(&c->sectors_available_lock);
sectors_available = bch2_recalc_sectors_available(c);
percpu_u64_set(&c->pcpu->sectors_available, 0);
sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
@ -2106,7 +2100,8 @@ recalculate:
ret = -ENOSPC;
}
percpu_up_write(&c->mark_lock);
mutex_unlock(&c->sectors_available_lock);
percpu_up_read(&c->mark_lock);
return ret;
}

View File

@ -264,7 +264,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
len << 9);
if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
__bcache_io_error(c,
bch_err_ratelimited(c,
"checksum error while doing reconstruct read (%u:%u)",
i, j);
clear_bit(i, buf->valid);
@ -305,7 +305,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
unsigned bytes = buf->size << 9;
if (ec_nr_failed(buf) > v->nr_redundant) {
__bcache_io_error(c,
bch_err_ratelimited(c,
"error doing reconstruct read: unable to read enough blocks");
return -1;
}
@ -326,7 +326,7 @@ static void ec_block_endio(struct bio *bio)
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s",
if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
bio_data_dir(bio) ? "write" : "read",
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
@ -420,7 +420,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
BTREE_ITER_SLOTS);
k = bch2_btree_iter_peek_slot(iter);
if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
__bcache_io_error(c,
bch_err_ratelimited(c,
"error doing reconstruct read: stripe not found");
kfree(buf);
return bch2_trans_exit(&trans) ?: -EIO;
@ -462,7 +462,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr_stale(ca, ptr)) {
__bcache_io_error(c,
bch_err_ratelimited(c,
"error doing reconstruct read: stale pointer");
clear_bit(i, buf->valid);
continue;
@ -474,7 +474,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
closure_sync(&cl);
if (ec_nr_failed(buf) > v->nr_redundant) {
__bcache_io_error(c,
bch_err_ratelimited(c,
"error doing reconstruct read: unable to read enough blocks");
ret = -EIO;
goto err;

View File

@ -181,12 +181,18 @@ void bch2_io_error(struct bch_dev *);
/* Logs message and handles the error: */
#define bch2_dev_io_error(ca, fmt, ...) \
do { \
printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \
"IO error on %s for " fmt), \
printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \
(ca)->name, ##__VA_ARGS__); \
bch2_io_error(ca); \
} while (0)
#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \
do { \
printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\
(ca)->name, (_inum), (_offset), ##__VA_ARGS__); \
bch2_io_error(ca); \
} while (0)
#define bch2_dev_io_err_on(cond, ca, ...) \
({ \
bool _ret = (cond); \
@ -196,16 +202,13 @@ do { \
_ret; \
})
/* kill? */
#define __bcache_io_error(c, fmt, ...) \
printk_ratelimited(KERN_ERR bch2_fmt(c, \
"IO error: " fmt), ##__VA_ARGS__)
#define bcache_io_error(c, bio, fmt, ...) \
do { \
__bcache_io_error(c, fmt, ##__VA_ARGS__); \
(bio)->bi_status = BLK_STS_IOERR; \
} while (0)
#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) \
bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\
_ret; \
})
#endif /* _BCACHEFS_ERROR_H */
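The old __bcache_io_error()/bcache_io_error() helpers, which only knew the filesystem, are replaced by inum-aware variants; like bch2_dev_io_err_on(), the new bch2_dev_inum_io_err_on() evaluates to the condition so callers can branch on it, as the io.c hunks below do. A hypothetical call site:

	/* hypothetical read completion path: log a rate-limited error with
	 * device, inode and offset context, then retry or fail the read */
	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, inum, offset,
				    "data read error: %s",
				    bch2_blk_status_to_str(bio->bi_status)))
		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);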

View File

@ -856,7 +856,9 @@ retry:
goto retry;
if (ret) {
bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
bch_err_inum_ratelimited(c, inum,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
}
@ -1013,6 +1015,8 @@ static void bch2_writepage_io_done(struct closure *cl)
unsigned i;
if (io->op.error) {
set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
@ -1902,7 +1906,13 @@ loop:
bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
if (!dio->iter.count || dio->op.error)
if (dio->op.error) {
set_bit(EI_INODE_ERROR, &inode->ei_flags);
break;
}
if (!dio->iter.count)
break;
bio_reset(bio);
@ -2290,7 +2300,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
if (ret)
goto err;
BUG_ON(inode->v.i_size < inode_u.bi_size);
WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
inode->v.i_size < inode_u.bi_size);
if (iattr->ia_size > inode->v.i_size) {
ret = bch2_extend(inode, &inode_u, iattr);

View File

@ -1151,6 +1151,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
inode->v.i_generation = bi->bi_generation;
inode->v.i_size = bi->bi_size;
inode->ei_flags = 0;
inode->ei_journal_seq = 0;
inode->ei_quota_reserved = 0;
inode->ei_str_hash = bch2_hash_info_init(c, bi);

View File

@ -33,6 +33,7 @@ void bch2_pagecache_block_get(struct pagecache_lock *);
struct bch_inode_info {
struct inode v;
unsigned long ei_flags;
struct mutex ei_update_lock;
u64 ei_journal_seq;
@ -50,6 +51,12 @@ struct bch_inode_info {
struct bch_inode_unpacked ei_inode;
};
/*
* Set if we've gotten a btree error for this inode, and thus the vfs inode and
* btree inode may be inconsistent:
*/
#define EI_INODE_ERROR 0
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)

View File

@ -576,7 +576,8 @@ static void __bch2_write_index(struct bch_write_op *op)
op->written += sectors_start - keylist_sectors(keys);
if (ret) {
__bcache_io_error(c, "btree IO error %i", ret);
bch_err_inum_ratelimited(c, op->pos.inode,
"write error %i from btree update", ret);
op->error = ret;
}
}
@ -621,7 +622,10 @@ static void bch2_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
op->pos.inode,
op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
"data write error: %s",
bch2_blk_status_to_str(bio->bi_status)))
set_bit(wbio->dev, op->failed.d);
@ -1279,15 +1283,14 @@ void bch2_write(struct closure *cl)
wbio_init(bio)->put_bio = false;
if (bio_sectors(bio) & (c->opts.block_size - 1)) {
__bcache_io_error(c, "misaligned write");
bch_err_inum_ratelimited(c, op->pos.inode,
"misaligned write");
op->error = -EIO;
goto err;
}
if (c->opts.nochanges ||
!percpu_ref_tryget(&c->writes)) {
if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
__bcache_io_error(c, "read only");
op->error = -EROFS;
goto err;
}
@ -1716,7 +1719,8 @@ retry:
* reading a btree node
*/
BUG_ON(!ret);
__bcache_io_error(c, "btree IO error: %i", ret);
bch_err_inum_ratelimited(c, inode,
"read error %i from btree lookup", ret);
err:
rbio->bio.bi_status = BLK_STS_IOERR;
out:
@ -1920,17 +1924,15 @@ csum_err:
return;
}
bch2_dev_io_error(ca,
"data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
bch2_dev_inum_io_error(ca, rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
return;
decompression_err:
__bcache_io_error(c, "decompression error, inode %llu offset %llu",
rbio->pos.inode,
(u64) rbio->bvec_iter.bi_sector);
bch_err_inum_ratelimited(c, rbio->pos.inode,
"decompression error");
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
return;
}
@ -1952,7 +1954,14 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s",
/*
* XXX: rbio->pos is not what we want here when reading from indirect
* extents
*/
if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
rbio->pos.inode,
rbio->pos.offset,
"data read error: %s",
bch2_blk_status_to_str(bio->bi_status))) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
@ -2002,7 +2011,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_reflink_v &&
k.k->type != KEY_TYPE_indirect_inline_data) {
__bcache_io_error(trans->c,
bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode,
"pointer to nonexistent indirect extent");
ret = -EIO;
goto err;
@ -2048,7 +2057,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
goto hole;
if (pick_ret < 0) {
__bcache_io_error(c, "no device to read from");
bch_err_inum_ratelimited(c, k.k->p.inode,
"no device to read from");
goto err;
}
@ -2198,7 +2208,8 @@ get_bio:
if (!rbio->pick.idx) {
if (!rbio->have_ioref) {
__bcache_io_error(c, "no device to read from");
bch_err_inum_ratelimited(c, k.k->p.inode,
"no device to read from");
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@ -2348,7 +2359,9 @@ err:
if (ret == -EINTR)
goto retry;
bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
bch_err_inum_ratelimited(c, inode,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
goto out;
}

View File

@ -24,7 +24,7 @@ static u64 last_unwritten_seq(struct journal *j)
lockdep_assert_held(&j->lock);
return journal_cur_seq(j) - s.prev_buf_unwritten;
return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
}
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
@ -52,7 +52,7 @@ journal_seq_to_buf(struct journal *j, u64 seq)
j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
if (journal_seq_unwritten(j, seq)) {
buf = j->buf + (seq & 1);
buf = j->buf + (seq & JOURNAL_BUF_MASK);
EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
}
return buf;
@ -80,6 +80,8 @@ static void bch2_journal_buf_init(struct journal *j)
struct journal_buf *buf = journal_cur_buf(j);
bkey_extent_init(&buf->key);
buf->noflush = false;
buf->must_flush = false;
memset(buf->has_inode, 0, sizeof(buf->has_inode));
@ -109,15 +111,8 @@ void bch2_journal_halt(struct journal *j)
/* journal entry close/open: */
void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
void __bch2_journal_buf_put(struct journal *j)
{
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
bch2_time_stats_update(j->delay_time,
j->need_write_time);
clear_bit(JOURNAL_NEED_WRITE, &j->flags);
closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
}
@ -130,7 +125,6 @@ static bool __journal_entry_close(struct journal *j)
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
bool set_need_write = false;
unsigned sectors;
lockdep_assert_held(&j->lock);
@ -149,15 +143,13 @@ static bool __journal_entry_close(struct journal *j)
if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
set_bit(JOURNAL_NEED_WRITE, &j->flags);
j->need_write_time = local_clock();
set_need_write = true;
}
if (new.prev_buf_unwritten)
return false;
new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
new.idx++;
new.prev_buf_unwritten = 1;
if (new.idx == new.unwritten_idx)
return false;
BUG_ON(journal_state_count(new, new.idx));
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
@ -191,24 +183,44 @@ static bool __journal_entry_close(struct journal *j)
*/
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
cancel_delayed_work(&j->write_work);
clear_bit(JOURNAL_NEED_WRITE, &j->flags);
bch2_journal_space_available(j);
bch2_journal_buf_put(j, old.idx, set_need_write);
bch2_journal_buf_put(j, old.idx);
return true;
}
static bool journal_entry_want_write(struct journal *j)
{
union journal_res_state s = READ_ONCE(j->reservations);
bool ret = false;
/*
* Don't close it yet if we already have a write in flight, but do set
* NEED_WRITE:
*/
if (s.idx != s.unwritten_idx)
set_bit(JOURNAL_NEED_WRITE, &j->flags);
else
ret = __journal_entry_close(j);
return ret;
}
static bool journal_entry_close(struct journal *j)
{
bool ret;
spin_lock(&j->lock);
ret = __journal_entry_close(j);
ret = journal_entry_want_write(j);
spin_unlock(&j->lock);
return ret;
@ -290,8 +302,8 @@ static int journal_entry_open(struct journal *j)
static bool journal_quiesced(struct journal *j)
{
union journal_res_state state = READ_ONCE(j->reservations);
bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
union journal_res_state s = READ_ONCE(j->reservations);
bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
if (!ret)
journal_entry_close(j);
@ -318,17 +330,29 @@ static void journal_write_work(struct work_struct *work)
u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
{
size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
u64 seq = 0;
union journal_res_state s;
unsigned i;
u64 seq;
if (!test_bit(h, j->buf[0].has_inode) &&
!test_bit(h, j->buf[1].has_inode))
return 0;
spin_lock(&j->lock);
if (test_bit(h, journal_cur_buf(j)->has_inode))
seq = journal_cur_seq(j);
else if (test_bit(h, journal_prev_buf(j)->has_inode))
seq = journal_cur_seq(j) - 1;
seq = journal_cur_seq(j);
s = READ_ONCE(j->reservations);
i = s.idx;
while (1) {
if (test_bit(h, j->buf[i].has_inode))
goto out;
if (i == s.unwritten_idx)
break;
i = (i - 1) & JOURNAL_BUF_MASK;
seq--;
}
seq = 0;
out:
spin_unlock(&j->lock);
return seq;
@ -553,7 +577,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
struct journal_buf *buf;
int ret = 0;
if (seq <= j->seq_ondisk)
if (seq <= j->flushed_seq_ondisk)
return 1;
spin_lock(&j->lock);
@ -564,18 +588,55 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
goto out;
}
if (seq <= j->seq_ondisk) {
if (seq <= j->flushed_seq_ondisk) {
ret = 1;
goto out;
}
if (parent &&
(buf = journal_seq_to_buf(j, seq)))
if (!closure_wait(&buf->wait, parent))
/* if seq was written, but not flushed - flush a newer one instead */
seq = max(seq, last_unwritten_seq(j));
recheck_need_open:
if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
struct journal_res res = { 0 };
spin_unlock(&j->lock);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
seq = res.seq;
buf = j->buf + (seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
set_bit(JOURNAL_NEED_WRITE, &j->flags);
if (parent && !closure_wait(&buf->wait, parent))
BUG();
bch2_journal_res_put(j, &res);
spin_lock(&j->lock);
goto want_write;
}
/*
* if write was kicked off without a flush, flush the next sequence
* number instead
*/
buf = journal_seq_to_buf(j, seq);
if (buf->noflush) {
seq++;
goto recheck_need_open;
}
buf->must_flush = true;
if (parent && !closure_wait(&buf->wait, parent))
BUG();
want_write:
if (seq == journal_cur_seq(j))
__journal_entry_close(j);
journal_entry_want_write(j);
out:
spin_unlock(&j->lock);
return ret;
@ -864,15 +925,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
union journal_res_state state;
struct journal_buf *w;
bool ret;
bool ret = false;
unsigned i;
spin_lock(&j->lock);
state = READ_ONCE(j->reservations);
w = j->buf + !state.idx;
i = state.idx;
ret = state.prev_buf_unwritten &&
bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
while (i != state.unwritten_idx) {
i = (i - 1) & JOURNAL_BUF_MASK;
if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
ret = true;
}
spin_unlock(&j->lock);
return ret;
@ -955,10 +1019,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
spin_lock(&j->lock);
set_bit(JOURNAL_STARTED, &j->flags);
j->last_flush_write = jiffies;
journal_pin_new_entry(j, 1);
j->reservations.idx = journal_cur_seq(j);
j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
bch2_journal_buf_init(j);
@ -1013,8 +1078,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
kvpfree(j->buf[1].data, j->buf[1].buf_size);
kvpfree(j->buf[0].data, j->buf[0].buf_size);
unsigned i;
for (i = 0; i < ARRAY_SIZE(j->buf); i++)
kvpfree(j->buf[i].data, j->buf[i].buf_size);
free_fifo(&j->pin);
}
@ -1022,6 +1089,7 @@ int bch2_fs_journal_init(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
static struct lock_class_key res_key;
unsigned i;
int ret = 0;
pr_verbose_init(c->opts, "");
@ -1036,8 +1104,6 @@ int bch2_fs_journal_init(struct journal *j)
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
@ -1049,13 +1115,20 @@ int bch2_fs_journal_init(struct journal *j)
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
!(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data) {
ret = -ENOMEM;
goto out;
}
}
j->pin.front = j->pin.back = 1;
out:
pr_verbose_init(c->opts, "ret %i", ret);
@ -1069,7 +1142,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state s;
struct bch_dev *ca;
unsigned iter;
unsigned i;
rcu_read_lock();
spin_lock(&j->lock);
@ -1081,6 +1154,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
"nr flush writes:\t%llu\n"
"nr noflush writes:\t%llu\n"
"nr direct reclaim:\t%llu\n"
"nr background reclaim:\t%llu\n"
"current entry sectors:\t%u\n"
@ -1092,6 +1167,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->last_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
j->nr_flush_writes,
j->nr_noflush_writes,
j->nr_direct_reclaim,
j->nr_background_reclaim,
j->cur_entry_sectors,
@ -1112,16 +1189,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
pr_buf(out,
"current entry refs:\t%u\n"
"prev entry unwritten:\t",
journal_state_count(s, s.idx));
"current entry:\t\tidx %u refcount %u\n",
s.idx, journal_state_count(s, s.idx));
if (s.prev_buf_unwritten)
pr_buf(out, "yes, ref %u sectors %u\n",
journal_state_count(s, !s.idx),
journal_prev_buf(j)->sectors);
else
pr_buf(out, "no\n");
i = s.idx;
while (i != s.unwritten_idx) {
i = (i - 1) & JOURNAL_BUF_MASK;
pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
i, journal_state_count(s, i), j->buf[i].sectors);
}
pr_buf(out,
"need write:\t\t%i\n"
@ -1129,7 +1206,21 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
test_bit(JOURNAL_NEED_WRITE, &j->flags),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
for_each_member_device_rcu(ca, c, iter,
pr_buf(out, "space:\n");
pr_buf(out, "\tdiscarded\t%u:%u\n",
j->space[journal_space_discarded].next_entry,
j->space[journal_space_discarded].total);
pr_buf(out, "\tclean ondisk\t%u:%u\n",
j->space[journal_space_clean_ondisk].next_entry,
j->space[journal_space_clean_ondisk].total);
pr_buf(out, "\tclean\t\t%u:%u\n",
j->space[journal_space_clean].next_entry,
j->space[journal_space_clean].total);
pr_buf(out, "\ttotal\t\t%u:%u\n",
j->space[journal_space_total].next_entry,
j->space[journal_space_total].total);
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
@ -1139,12 +1230,13 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
pr_buf(out,
"dev %u:\n"
"\tnr\t\t%u\n"
"\tbucket size\t%u\n"
"\tavailable\t%u:%u\n"
"\tdiscard_idx\t\t%u\n"
"\tdirty_idx_ondisk\t%u (seq %llu)\n"
"\tdirty_idx\t\t%u (seq %llu)\n"
"\tdiscard_idx\t%u\n"
"\tdirty_ondisk\t%u (seq %llu)\n"
"\tdirty_idx\t%u (seq %llu)\n"
"\tcur_idx\t\t%u (seq %llu)\n",
iter, ja->nr,
i, ja->nr, ca->mi.bucket_size,
bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
ja->sectors_free,
ja->discard_idx,

View File

@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j)
return j->buf + j->reservations.idx;
}
static inline struct journal_buf *journal_prev_buf(struct journal *j)
{
return j->buf + !j->reservations.idx;
}
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
@ -141,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
static inline u64 journal_cur_seq(struct journal *j)
{
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
return j->pin.back - 1;
}
@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64);
static inline int journal_state_count(union journal_res_state s, int idx)
{
return idx == 0 ? s.buf0_count : s.buf1_count;
switch (idx) {
case 0: return s.buf0_count;
case 1: return s.buf1_count;
case 2: return s.buf2_count;
case 3: return s.buf3_count;
}
BUG();
}
static inline void journal_state_inc(union journal_res_state *s)
{
s->buf0_count += s->idx == 0;
s->buf1_count += s->idx == 1;
s->buf2_count += s->idx == 2;
s->buf3_count += s->idx == 3;
}
static inline void bch2_journal_set_has_inode(struct journal *j,
@ -255,21 +258,24 @@ static inline bool journal_entry_empty(struct jset *j)
return true;
}
void __bch2_journal_buf_put(struct journal *, bool);
void __bch2_journal_buf_put(struct journal *);
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
bool need_write_just_set)
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
{
union journal_res_state s;
s.v = atomic64_sub_return(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
.buf2_count = idx == 2,
.buf3_count = idx == 3,
}).v, &j->reservations.counter);
if (!journal_state_count(s, idx)) {
EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
__bch2_journal_buf_put(j, need_write_just_set);
}
EBUG_ON(((s.idx - idx) & 3) >
((s.idx - s.unwritten_idx) & 3));
if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
__bch2_journal_buf_put(j);
}
/*
@ -289,7 +295,7 @@ static inline void bch2_journal_res_put(struct journal *j,
BCH_JSET_ENTRY_btree_keys,
0, 0, NULL, 0);
bch2_journal_buf_put(j, res->idx, false);
bch2_journal_buf_put(j, res->idx);
res->ref = 0;
}
@ -325,11 +331,18 @@ static inline int journal_res_get_fast(struct journal *j,
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
return 0;
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
new.cur_entry_offset += res->u64s;
journal_state_inc(&new);
/*
* If the refcount would overflow, we have to wait:
* XXX - tracepoint this:
*/
if (!journal_state_count(new, new.idx))
return 0;
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);

View File

@ -10,10 +10,27 @@
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include <trace/events/bcachefs.h>
static void __journal_replay_free(struct journal_replay *i)
{
list_del(&i->list);
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
}
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
{
i->ignore = true;
if (!c->opts.read_entire_journal)
__journal_replay_free(i);
}
struct journal_list {
struct closure cl;
struct mutex lock;
@ -36,28 +53,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct bch_devs_list devs = { .nr = 0 };
struct list_head *where;
size_t bytes = vstruct_bytes(j);
__le64 last_seq;
u64 last_seq = 0;
int ret;
last_seq = !list_empty(jlist->head)
? list_last_entry(jlist->head, struct journal_replay,
list)->j.last_seq
: 0;
if (!c->opts.read_entire_journal) {
/* Is this entry older than the range we need? */
if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
goto out;
list_for_each_entry_reverse(i, jlist->head, list) {
if (!JSET_NO_FLUSH(&i->j)) {
last_seq = le64_to_cpu(i->j.last_seq);
break;
}
}
/* Drop entries we don't need anymore */
/* Is this entry older than the range we need? */
if (!c->opts.read_entire_journal &&
le64_to_cpu(j->seq) < last_seq) {
ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
goto out;
}
/* Drop entries we don't need anymore */
if (!JSET_NO_FLUSH(j)) {
list_for_each_entry_safe(i, pos, jlist->head, list) {
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
list_del(&i->list);
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
journal_replay_free(c, i);
}
}
@ -81,9 +99,7 @@ add:
if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
if (i->bad) {
devs = i->devs;
list_del(&i->list);
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
__journal_replay_free(i);
} else if (bad) {
goto found;
} else {
@ -105,6 +121,7 @@ add:
list_add(&i->list, where);
i->devs = devs;
i->bad = bad;
i->ignore = false;
memcpy(&i->j, j, bytes);
found:
if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@ -558,7 +575,7 @@ reread:
bio_put(bio);
if (bch2_dev_io_err_on(ret, ca,
"journal read from sector %llu",
"journal read error: sector %llu",
offset) ||
bch2_meta_read_fault("journal"))
return -EIO;
@ -699,14 +716,16 @@ err:
goto out;
}
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
int bch2_journal_read(struct bch_fs *c, struct list_head *list,
u64 *blacklist_seq, u64 *start_seq)
{
struct journal_list jlist;
struct journal_replay *i;
struct journal_replay *i, *t;
struct bch_dev *ca;
unsigned iter;
size_t keys = 0, entries = 0;
bool degraded = false;
u64 seq, last_seq = 0;
int ret = 0;
closure_init_stack(&jlist.cl);
@ -735,12 +754,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (jlist.ret)
return jlist.ret;
if (list_empty(list)) {
bch_info(c, "journal read done, but no entries found");
return 0;
}
i = list_last_entry(list, struct journal_replay, list);
*start_seq = le64_to_cpu(i->j.seq) + 1;
/*
* Find most recent flush entry, and ignore newer non flush entries -
* those entries will be blacklisted:
*/
list_for_each_entry_safe_reverse(i, t, list, list) {
if (i->ignore)
continue;
if (!JSET_NO_FLUSH(&i->j)) {
last_seq = le64_to_cpu(i->j.last_seq);
*blacklist_seq = le64_to_cpu(i->j.seq) + 1;
break;
}
journal_replay_free(c, i);
}
if (!last_seq) {
fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
return -1;
}
/* Drop blacklisted entries and entries older than last_seq: */
list_for_each_entry_safe(i, t, list, list) {
if (i->ignore)
continue;
seq = le64_to_cpu(i->j.seq);
if (seq < last_seq) {
journal_replay_free(c, i);
continue;
}
if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
"found blacklisted journal entry %llu", seq);
journal_replay_free(c, i);
}
}
/* Check for missing entries: */
seq = last_seq;
list_for_each_entry(i, list, list) {
if (i->ignore)
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
while (seq < le64_to_cpu(i->j.seq)) {
u64 missing_start, missing_end;
while (seq < le64_to_cpu(i->j.seq) &&
bch2_journal_seq_is_blacklisted(c, seq, false))
seq++;
if (seq == le64_to_cpu(i->j.seq))
break;
missing_start = seq;
while (seq < le64_to_cpu(i->j.seq) &&
!bch2_journal_seq_is_blacklisted(c, seq, false))
seq++;
missing_end = seq - 1;
fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
missing_start, missing_end,
last_seq, *blacklist_seq - 1);
}
seq++;
}
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct bch_replicas_padded replicas;
char buf[80];
if (i->ignore)
continue;
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
@ -768,12 +872,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
entries++;
}
if (!list_empty(list)) {
i = list_last_entry(list, struct journal_replay, list);
bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, *start_seq);
bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, le64_to_cpu(i->j.seq));
}
if (*start_seq != *blacklist_seq)
bch_info(c, "dropped unflushed entries %llu-%llu",
*blacklist_seq, *start_seq - 1);
fsck_err:
return ret;
}
@ -951,16 +1055,23 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
buf->buf_size = new_size;
}
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
return j->buf + j->reservations.unwritten_idx;
}
static void journal_write_done(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_prev_buf(j);
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_devs_list devs =
bch2_bkey_devs(bkey_i_to_s_c(&w->key));
struct bch_replicas_padded replicas;
union journal_res_state old, new;
u64 seq = le64_to_cpu(w->data->seq);
u64 last_seq = le64_to_cpu(w->data->last_seq);
u64 v;
int err = 0;
bch2_time_stats_update(j->write_time, j->write_start_time);
@ -984,8 +1095,12 @@ static void journal_write_done(struct closure *cl)
j->seq_ondisk = seq;
if (err && (!j->err_seq || seq < j->err_seq))
j->err_seq = seq;
j->last_seq_ondisk = last_seq;
bch2_journal_space_available(j);
if (!w->noflush) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = last_seq;
bch2_journal_space_available(j);
}
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@ -999,9 +1114,14 @@ static void journal_write_done(struct closure *cl)
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
BUG_ON(!j->reservations.prev_buf_unwritten);
atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
&j->reservations.counter);
v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
BUG_ON(new.idx == new.unwritten_idx);
new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
closure_wake_up(&w->wait);
journal_wake(j);
@ -1009,6 +1129,10 @@ static void journal_write_done(struct closure *cl)
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
mod_delayed_work(system_freezable_wq, &j->write_work, 0);
spin_unlock(&j->lock);
if (new.unwritten_idx != new.idx &&
!journal_state_count(new, new.unwritten_idx))
closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
}
static void journal_write_endio(struct bio *bio)
@ -1016,10 +1140,10 @@ static void journal_write_endio(struct bio *bio)
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
struct journal_buf *w = journal_prev_buf(j);
struct journal_buf *w = journal_last_unwritten_buf(j);
unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
@ -1036,7 +1160,7 @@ void bch2_journal_write(struct closure *cl)
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_prev_buf(j);
struct journal_buf *w = journal_last_unwritten_buf(j);
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
@ -1047,13 +1171,27 @@ void bch2_journal_write(struct closure *cl)
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
journal_buf_realloc(j, w);
jset = w->data;
j->write_start_time = local_clock();
spin_lock(&j->lock);
if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
j->nr_noflush_writes++;
} else {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
}
spin_unlock(&j->lock);
/*
* New btree roots are set by journalling them; when the journal entry
* gets written we have to propagate them to c->btree_roots
@ -1175,8 +1313,9 @@ retry_alloc:
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
bio_set_op_attrs(bio, REQ_OP_WRITE,
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
if (!JSET_NO_FLUSH(jset))
bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
bch2_bio_map(bio, jset, sectors << 9);
trace_journal_write(bio);
@ -1185,20 +1324,21 @@ retry_alloc:
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
}
for_each_rw_member(ca, c, i)
if (journal_flushes_device(ca) &&
!bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_FLUSH;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
if (!JSET_NO_FLUSH(jset)) {
for_each_rw_member(ca, c, i)
if (journal_flushes_device(ca) &&
!bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_FLUSH;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
}
no_io:
bch2_bucket_seq_cleanup(c);

View File

@ -11,6 +11,7 @@ struct journal_replay {
struct bch_devs_list devs;
/* checksum error, but we may want to try using it anyways: */
bool bad;
bool ignore;
/* must be last: */
struct jset j;
};
@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
void bch2_journal_write(struct closure *);

View File

@ -58,81 +58,107 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
old.v, new.v)) != old.v);
}
static struct journal_space {
unsigned next_entry;
unsigned remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
{
unsigned sectors = 0;
while (!sectors && *idx != j->reservations.idx) {
sectors = j->buf[*idx].sectors;
*idx = (*idx + 1) & JOURNAL_BUF_MASK;
}
return sectors;
}
static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
enum journal_space_from from)
{
struct journal_device *ja = &ca->journal;
unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
if (from == journal_space_total)
return (struct journal_space) {
.next_entry = ca->mi.bucket_size,
.total = ca->mi.bucket_size * ja->nr,
};
buckets = bch2_journal_dev_buckets_available(j, ja, from);
sectors = ja->sectors_free;
/*
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
while ((unwritten = get_unwritten_sectors(j, &idx))) {
if (unwritten >= sectors) {
if (!buckets) {
sectors = 0;
break;
}
buckets--;
sectors = ca->mi.bucket_size;
}
sectors -= unwritten;
}
if (sectors < ca->mi.bucket_size && buckets) {
buckets--;
sectors = ca->mi.bucket_size;
}
return (struct journal_space) {
.next_entry = sectors,
.total = sectors + buckets * ca->mi.bucket_size,
};
}
static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
enum journal_space_from from)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned sectors_next_entry = UINT_MAX;
unsigned sectors_total = UINT_MAX;
unsigned i, nr_devs = 0;
unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
? journal_prev_buf(j)->sectors
: 0;
unsigned i, pos, nr_devs = 0;
struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
rcu_read_lock();
for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
unsigned buckets_this_device, sectors_this_device;
if (!ja->nr)
if (!ca->journal.nr)
continue;
buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
sectors_this_device = ja->sectors_free;
/*
* Note that we don't allocate the space for a journal entry
* until we write it out - thus, account for it here:
*/
if (unwritten_sectors >= sectors_this_device) {
if (!buckets_this_device)
continue;
buckets_this_device--;
sectors_this_device = ca->mi.bucket_size;
}
sectors_this_device -= unwritten_sectors;
if (sectors_this_device < ca->mi.bucket_size &&
buckets_this_device) {
buckets_this_device--;
sectors_this_device = ca->mi.bucket_size;
}
if (!sectors_this_device)
space = journal_dev_space_available(j, ca, from);
if (!space.next_entry)
continue;
sectors_next_entry = min(sectors_next_entry,
sectors_this_device);
for (pos = 0; pos < nr_devs; pos++)
if (space.total > dev_space[pos].total)
break;
sectors_total = min(sectors_total,
buckets_this_device * ca->mi.bucket_size +
sectors_this_device);
nr_devs++;
array_insert_item(dev_space, nr_devs, pos, space);
}
rcu_read_unlock();
if (nr_devs < nr_devs_want)
return (struct journal_space) { 0, 0 };
return (struct journal_space) {
.next_entry = sectors_next_entry,
.remaining = max_t(int, 0, sectors_total - sectors_next_entry),
};
/*
* We sorted largest to smallest, and we want the smallest out of the
* @nr_devs_want largest devices:
*/
return dev_space[nr_devs_want - 1];
}
void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_space discarded, clean_ondisk, clean;
unsigned clean, clean_ondisk, total;
unsigned overhead, u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
@ -173,27 +199,33 @@ void bch2_journal_space_available(struct journal *j)
goto out;
}
if (!fifo_free(&j->pin)) {
ret = cur_entry_journal_pin_full;
goto out;
}
nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded);
clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
clean = __journal_space_available(j, nr_devs_want, journal_space_clean);
for (i = 0; i < journal_space_nr; i++)
j->space[i] = __journal_space_available(j, nr_devs_want, i);
if (!discarded.next_entry)
clean_ondisk = j->space[journal_space_clean_ondisk].total;
clean = j->space[journal_space_clean].total;
total = j->space[journal_space_total].total;
if (!j->space[journal_space_discarded].next_entry)
ret = cur_entry_journal_full;
else if (!fifo_free(&j->pin))
ret = cur_entry_journal_pin_full;
overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
if ((clean - clean_ondisk <= total / 8) &&
(clean_ondisk * 2 > clean ))
set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
overhead = DIV_ROUND_UP(clean, max_entry_size) *
journal_entry_overhead(j);
u64s_remaining = clean.remaining << 6;
u64s_remaining = clean << 6;
u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
u64s_remaining /= 4;
out:
j->cur_entry_sectors = !ret ? discarded.next_entry : 0;
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
journal_check_may_get_unreserved(j);
@ -277,6 +309,14 @@ static void bch2_journal_reclaim_fast(struct journal *j)
bch2_journal_space_available(j);
}
void __bch2_journal_pin_put(struct journal *j, u64 seq)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
if (atomic_dec_and_test(&pin_list->count))
bch2_journal_reclaim_fast(j);
}
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
@ -485,13 +525,14 @@ static u64 journal_seq_to_flush(struct journal *j)
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
static void __bch2_journal_reclaim(struct journal *j, bool direct)
static int __bch2_journal_reclaim(struct journal *j, bool direct)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush, nr_flushed = 0;
size_t min_nr;
unsigned flags;
int ret = 0;
/*
* We can't invoke memory reclaim while holding the reclaim_lock -
@ -506,6 +547,11 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct)
if (kthread && kthread_should_stop())
break;
if (bch2_journal_error(j)) {
ret = -EIO;
break;
}
bch2_journal_do_discards(j);
seq_to_flush = journal_seq_to_flush(j);
@ -547,27 +593,30 @@ static void __bch2_journal_reclaim(struct journal *j, bool direct)
} while (min_nr);
memalloc_noreclaim_restore(flags);
return ret;
}
void bch2_journal_reclaim(struct journal *j)
int bch2_journal_reclaim(struct journal *j)
{
__bch2_journal_reclaim(j, true);
return __bch2_journal_reclaim(j, true);
}
static int bch2_journal_reclaim_thread(void *arg)
{
struct journal *j = arg;
unsigned long next;
int ret = 0;
set_freezable();
kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
while (!kthread_should_stop()) {
while (!ret && !kthread_should_stop()) {
j->reclaim_kicked = false;
mutex_lock(&j->reclaim_lock);
__bch2_journal_reclaim(j, false);
ret = __bch2_journal_reclaim(j, false);
mutex_unlock(&j->reclaim_lock);
next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
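The JOURNAL_MAY_SKIP_FLUSH bit set in bch2_journal_space_available() above is what permits noflush writes: it stays set only while little clean journal space is waiting on a flush and most clean space is already durable. A sketch of the heuristic with invented numbers:

/* Sketch of the condition above; all figures are made up for illustration. */
static bool may_skip_flush(unsigned clean, unsigned clean_ondisk, unsigned total)
{
	return clean - clean_ondisk <= total / 8 &&	/* little unflushed clean space */
	       clean_ondisk * 2 > clean;		/* most clean space already durable */
}

/* e.g. total = 800, clean = 200, clean_ondisk = 160:
 *   200 - 160 = 40 <= 100 and 160 * 2 = 320 > 200, so flushes may be skipped;
 * with clean_ondisk = 80: 200 - 80 = 120 > 100, so the flag gets cleared. */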

View File

@ -4,12 +4,6 @@
#define JOURNAL_PIN (32 * 1024)
enum journal_space_from {
journal_space_discarded,
journal_space_clean_ondisk,
journal_space_clean,
};
static inline void journal_reclaim_kick(struct journal *j)
{
struct task_struct *p = READ_ONCE(j->reclaim_thread);
@ -39,6 +33,7 @@ journal_seq_pin(struct journal *j, u64 seq)
return &j->pin.data[seq & j->pin.mask];
}
void __bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
@ -73,7 +68,7 @@ static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
void bch2_journal_do_discards(struct journal *);
void bch2_journal_reclaim(struct journal *);
int bch2_journal_reclaim(struct journal *);
void bch2_journal_reclaim_stop(struct journal *);
int bch2_journal_reclaim_start(struct journal *);

View File

@ -118,7 +118,7 @@ out_write_sb:
out:
mutex_unlock(&c->sb_lock);
return ret;
return ret ?: bch2_blacklist_table_initialize(c);
}
static int journal_seq_blacklist_table_cmp(const void *_l,
@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
struct journal_seq_blacklist_table *t;
unsigned i, nr = blacklist_nr_entries(bl);
BUG_ON(c->journal_seq_blacklist_table);
if (!bl)
return 0;
@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
journal_seq_blacklist_table_cmp,
NULL);
kfree(c->journal_seq_blacklist_table);
c->journal_seq_blacklist_table = t;
return 0;
}

View File

@ -9,11 +9,13 @@
#include "super_types.h"
#include "fifo.h"
struct journal_res;
#define JOURNAL_BUF_BITS 2
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
/*
* We put two of these in struct journal; we use them for writes to the
* journal that are being staged or in flight.
* We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
* the journal that are being staged or in flight.
*/
struct journal_buf {
struct jset *data;
@ -27,6 +29,8 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
@ -81,10 +85,12 @@ union journal_res_state {
struct {
u64 cur_entry_offset:20,
idx:1,
prev_buf_unwritten:1,
buf0_count:21,
buf1_count:21;
idx:2,
unwritten_idx:2,
buf0_count:10,
buf1_count:10,
buf2_count:10,
buf3_count:10;
};
};
@ -116,6 +122,20 @@ union journal_preres_state {
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
struct journal_space {
/* Units of 512-byte sectors: */
unsigned next_entry; /* How big the next journal entry can be */
unsigned total;
};
enum journal_space_from {
journal_space_discarded,
journal_space_clean_ondisk,
journal_space_clean,
journal_space_total,
journal_space_nr,
};
/*
* JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
* either because something's waiting on the write to complete or because it's
@ -128,6 +148,7 @@ enum {
JOURNAL_RECLAIM_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_MAY_GET_UNRESERVED,
JOURNAL_MAY_SKIP_FLUSH,
};
/* Embedded in struct bch_fs */
@ -165,7 +186,7 @@ struct journal {
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
*/
struct journal_buf buf[2];
struct journal_buf buf[JOURNAL_BUF_NR];
spinlock_t lock;
@ -185,6 +206,7 @@ struct journal {
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
u64 flushed_seq_ondisk;
u64 last_seq_ondisk;
u64 err_seq;
u64 last_empty_seq;
@ -210,6 +232,8 @@ struct journal {
struct journal_entry_pin_list *data;
} pin;
struct journal_space space[journal_space_nr];
u64 replay_journal_seq;
u64 replay_journal_seq_end;
@ -232,11 +256,15 @@ struct journal {
unsigned write_delay_ms;
unsigned reclaim_delay_ms;
unsigned long last_flush_write;
u64 res_get_blocked_start;
u64 need_write_time;
u64 write_start_time;
u64 nr_flush_writes;
u64 nr_noflush_writes;
struct time_stats *write_time;
struct time_stats *delay_time;
struct time_stats *blocked_time;

View File

@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
{
struct journal_replay *p;
struct journal_replay *i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct journal_keys keys = { NULL };
@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if (list_empty(journal_entries))
return keys;
keys.journal_seq_base =
le64_to_cpu(list_last_entry(journal_entries,
struct journal_replay, list)->j.last_seq);
list_for_each_entry(p, journal_entries, list) {
if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
list_for_each_entry(i, journal_entries, list) {
if (i->ignore)
continue;
for_each_jset_key(k, _n, entry, &p->j)
if (!keys.journal_seq_base)
keys.journal_seq_base = le64_to_cpu(i->j.seq);
for_each_jset_key(k, _n, entry, &i->j)
nr_keys++;
}
keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
if (!keys.d)
goto err;
list_for_each_entry(p, journal_entries, list) {
if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
list_for_each_entry(i, journal_entries, list) {
if (i->ignore)
continue;
for_each_jset_key(k, _n, entry, &p->j)
BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
for_each_jset_key(k, _n, entry, &i->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
.journal_seq = le64_to_cpu(p->j.seq) -
.journal_seq = le64_to_cpu(i->j.seq) -
keys.journal_seq_base,
.journal_offset = k->_data - p->j._data,
.journal_offset = k->_data - i->j._data,
};
}
@ -643,46 +643,6 @@ err:
return ret;
}
static bool journal_empty(struct list_head *journal)
{
return list_empty(journal) ||
journal_entry_empty(&list_last_entry(journal,
struct journal_replay, list)->j);
}
static int
verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
struct list_head *journal)
{
struct journal_replay *i =
list_last_entry(journal, struct journal_replay, list);
u64 start_seq = le64_to_cpu(i->j.last_seq);
u64 end_seq = le64_to_cpu(i->j.seq);
u64 seq = start_seq;
int ret = 0;
list_for_each_entry(i, journal, list) {
if (le64_to_cpu(i->j.seq) < start_seq)
continue;
fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
"journal entries %llu-%llu missing! (replaying %llu-%llu)",
seq, le64_to_cpu(i->j.seq) - 1,
start_seq, end_seq);
seq = le64_to_cpu(i->j.seq);
fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
"found blacklisted journal entry %llu", seq);
do {
seq++;
} while (bch2_journal_seq_is_blacklisted(c, seq, false));
}
fsck_err:
return ret;
}
/* journal replay early: */
static int journal_replay_entry_early(struct bch_fs *c,
@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct list_head *journal)
{
struct journal_replay *i;
struct jset_entry *entry;
int ret;
@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c,
return ret;
}
} else {
struct journal_replay *i =
list_last_entry(journal, struct journal_replay, list);
list_for_each_entry(i, journal, list) {
if (i->ignore)
continue;
c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
list_for_each_entry(i, journal, list)
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
}
}
bch2_fs_usage_initialize(c);
@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c,
struct bch_sb_field_clean *clean = *cleanp;
int ret = 0;
if (!c->sb.clean || !j)
return 0;
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
le64_to_cpu(clean->journal_seq),
@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL;
u64 journal_seq;
struct jset *last_journal_entry = NULL;
u64 blacklist_seq, journal_seq;
bool write_sb = false, need_write_alloc = false;
int ret;
@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
struct jset *j;
ret = bch2_blacklist_table_initialize(c);
if (ret) {
bch_err(c, "error initializing blacklist table");
goto err;
}
ret = bch2_journal_read(c, &c->journal_entries);
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
struct journal_replay *i;
ret = bch2_journal_read(c, &c->journal_entries,
&blacklist_seq, &journal_seq);
if (ret)
goto err;
if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
list_for_each_entry_reverse(i, &c->journal_entries, list)
if (!i->ignore) {
last_journal_entry = &i->j;
break;
}
if (mustfix_fsck_err_on(c->sb.clean &&
last_journal_entry &&
!journal_entry_empty(last_journal_entry), c,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
if (!c->sb.clean && list_empty(&c->journal_entries)) {
bch_err(c, "no journal entries found");
ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
goto err;
if (!last_journal_entry) {
fsck_err_on(!c->sb.clean, c, "no journal entries found");
goto use_clean;
}
c->journal_keys = journal_keys_sort(&c->journal_entries);
@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
j = &list_last_entry(&c->journal_entries,
struct journal_replay, list)->j;
ret = verify_superblock_clean(c, &clean, j);
if (ret)
if (c->sb.clean && last_journal_entry) {
ret = verify_superblock_clean(c, &clean,
last_journal_entry);
if (ret)
goto err;
}
} else {
use_clean:
if (!clean) {
bch_err(c, "no superblock clean section found");
ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
goto err;
journal_seq = le64_to_cpu(j->seq) + 1;
} else {
journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
if (!c->sb.clean &&
@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c)
if (ret)
goto err;
if (!c->sb.clean) {
/*
* After an unclean shutdown, skip the next few journal sequence
* numbers as they may have been referenced by btree writes that
* happened before their corresponding journal writes - those btree
* writes need to be ignored, by skipping and blacklisting the next few
* journal sequence numbers:
*/
if (!c->sb.clean)
journal_seq += 8;
if (blacklist_seq != journal_seq) {
ret = bch2_journal_seq_blacklist_add(c,
journal_seq,
journal_seq + 4);
blacklist_seq, journal_seq);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
goto err;
}
journal_seq += 4;
/*
* The superblock needs to be written before we do any btree
* node writes: it will be in the read_write() path
*/
}
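/*
 * Worked example (illustrative, not part of the patch): assuming the journal
 * read returned blacklist_seq == journal_seq == 100, the unclean-shutdown
 * bump above moves journal_seq to 108, so seqs 100..107 are blacklisted and
 * new journal writes resume at 108.
 */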
ret = bch2_blacklist_table_initialize(c);
if (!list_empty(&c->journal_entries)) {
ret = verify_journal_entries_not_blacklisted_or_missing(c,
&c->journal_entries);
if (ret)
goto err;
}
ret = bch2_fs_journal_start(&c->journal, journal_seq,

View File

@ -275,53 +275,55 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
static int replicas_table_update(struct bch_fs *c,
struct bch_replicas_cpu *new_r)
{
struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
struct bch_fs_usage *new_scratch = NULL;
struct bch_fs_usage __percpu *new_gc = NULL;
struct bch_fs_usage *new_base = NULL;
unsigned bytes = sizeof(struct bch_fs_usage) +
unsigned i, bytes = sizeof(struct bch_fs_usage) +
sizeof(u64) * new_r->nr;
int ret = -ENOMEM;
int ret = 0;
memset(new_usage, 0, sizeof(new_usage));
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
sizeof(u64), GFP_NOIO)))
goto err;
if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
GFP_NOIO)) ||
!(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
GFP_NOIO)) ||
!(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
(c->usage_gc &&
!(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) {
bch_err(c, "error updating replicas table: memory allocation failure");
!(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
goto err;
}
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
if (c->usage[i])
__replicas_table_update_pcpu(new_usage[i], new_r,
c->usage[i], &c->replicas);
if (c->usage_base)
__replicas_table_update(new_base, new_r,
c->usage_base, &c->replicas);
if (c->usage[0])
__replicas_table_update_pcpu(new_usage[0], new_r,
c->usage[0], &c->replicas);
if (c->usage[1])
__replicas_table_update_pcpu(new_usage[1], new_r,
c->usage[1], &c->replicas);
if (c->usage_gc)
__replicas_table_update_pcpu(new_gc, new_r,
c->usage_gc, &c->replicas);
for (i = 0; i < ARRAY_SIZE(new_usage); i++)
swap(c->usage[i], new_usage[i]);
swap(c->usage_base, new_base);
swap(c->usage[0], new_usage[0]);
swap(c->usage[1], new_usage[1]);
swap(c->usage_scratch, new_scratch);
swap(c->usage_gc, new_gc);
swap(c->replicas, *new_r);
ret = 0;
err:
out:
free_percpu(new_gc);
kfree(new_scratch);
free_percpu(new_usage[1]);
free_percpu(new_usage[0]);
kfree(new_base);
return ret;
err:
bch_err(c, "error updating replicas table: memory allocation failure");
ret = -ENOMEM;
goto out;
}
static unsigned reserve_journal_replicas(struct bch_fs *c,
@ -496,9 +498,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
struct bch_replicas_cpu n;
if (!__replicas_has_entry(&c->replicas_gc, e) &&
(c->usage_base->replicas[i] ||
percpu_u64_get(&c->usage[0]->replicas[i]) ||
percpu_u64_get(&c->usage[1]->replicas[i]))) {
bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
n = cpu_replicas_add_entry(&c->replicas_gc, e);
if (!n.entries) {
ret = -ENOSPC;
@ -603,9 +603,7 @@ retry:
cpu_replicas_entry(&c->replicas, i);
if (e->data_type == BCH_DATA_journal ||
c->usage_base->replicas[i] ||
percpu_u64_get(&c->usage[0]->replicas[i]) ||
percpu_u64_get(&c->usage[1]->replicas[i]))
bch2_fs_usage_read_one(c, &c->usage_base->replicas[i]))
memcpy(cpu_replicas_entry(&new, new.nr++),
e, new.entry_size);
}

View File

@ -636,7 +636,7 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s",
if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
@ -995,10 +995,10 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
percpu_down_write(&c->mark_lock);
if (!journal_seq) {
bch2_fs_usage_acc_to_base(c, 0);
bch2_fs_usage_acc_to_base(c, 1);
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
} else {
bch2_fs_usage_acc_to_base(c, journal_seq & 1);
bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
}
{

View File

@ -475,8 +475,8 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_journal_entries_free(&c->journal_entries);
percpu_free_rwsem(&c->mark_lock);
kfree(c->usage_scratch);
free_percpu(c->usage[1]);
free_percpu(c->usage[0]);
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
free_percpu(c->usage[i]);
kfree(c->usage_base);
if (c->btree_iters_bufs)
@ -716,6 +716,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_cache_init_early(&c->btree_cache);
mutex_init(&c->sectors_available_lock);
if (percpu_init_rwsem(&c->mark_lock))
goto err;